From 9bbff15c27c2be0b6101f17ddaa7f53a504824ea Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 02:12:09 +0800
Subject: [PATCH 1/9] text processing tweaks

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 src/calibre/ebooks/txt/input.py        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 27dacdf5fb..52d1bcc619 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,7 +190,7 @@ class PreProcessor(object):
         line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
-        txt_line_wrap = u"(\u0020|\u0009)*\n"
+        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
         
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         if format == 'txt':
@@ -357,6 +357,6 @@ class PreProcessor(object):
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         # Center separator lines
-        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
 
         return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 98756c5fa1..eac46385a7 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
             
             # We don't check for block because the processor assumes block.
             # single and print at transformed to block for processing.
-            if options.paragraph_type == 'single' or 'unformatted':
+            if options.paragraph_type in ('single', 'unformatted'):
                 txt = separate_paragraphs_single_line(txt)
             elif options.paragraph_type == 'print':
                 txt = separate_paragraphs_print_formatted(txt)

From e9130241603a99f7e8dddfb8ff7df6edf4faacb5 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 10:40:30 +0800
Subject: [PATCH 2/9] added unformatted to the paragraph_type choices for txt input

---
 src/calibre/ebooks/txt/input.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index e2405de617..34a702cc55 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -23,7 +23,7 @@ class TXTInput(InputFormatPlugin):
 
     options = set([
         OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print'],
+            choices=['auto', 'block', 'single', 'print', 'unformatted'],
             help=_('Paragraph structure.\n'
                    'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
                    '* auto: Try to auto detect paragraph type.\n'

From 289cdf33925dc4f80c08889e941becc9c3862471 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 10:43:24 +0800
Subject: [PATCH 3/9] changed unformatted description

---
 src/calibre/ebooks/txt/input.py     | 2 +-
 src/calibre/ebooks/txt/processor.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 34a702cc55..9bc9323a4c 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -31,7 +31,7 @@ class TXTInput(InputFormatPlugin):
                    '* single: Assume every line is a paragraph.\n'
                    '* print:  Assume every line starting with 2+ spaces or a tab '
                    'starts a paragraph.'
-                   '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
             choices=['auto', 'none', 'heuristic', 'markdown'],
             help=_('Formatting used within the document.'
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 9dc29e45dd..e26f0a9d07 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -117,7 +117,7 @@ def detect_paragraph_type(txt):
     single: Each line is a paragraph.
     print: Each paragraph starts with a 2+ spaces or a tab
            and ends when a new paragraph is reached.
-    unformatted: most lines have hard line breaks, few/no spaces or indents
+    unformatted: most lines have hard line breaks, few/no blank lines or indents
     
     returns block, single, print, unformatted
     '''

From f3a9f3f83f7da4821bdc1fca2ba0df66aca714e1 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 17:27:24 +0800
Subject: [PATCH 4/9] added dehyphenation to txt input

---
 src/calibre/ebooks/conversion/preprocess.py | 15 +++++++++++----
 src/calibre/ebooks/txt/input.py             | 18 ++++++++++++------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ae111355e4..df9fd66407 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,6 +72,8 @@ class DocAnalysis(object):
 
     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
+        raw = raw.replace('\r\n', '\n')
+        raw = raw.replace('\r', '\n')
         if format == 'html':
             linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
         elif format == 'pdf':
@@ -79,7 +81,7 @@ class DocAnalysis(object):
         elif format == 'spanned_html':
             linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
         elif format == 'txt':
-            linere = re.compile('.*?\n', re.DOTALL)
+            linere = re.compile('.*?\n')
         self.lines = linere.findall(raw)
 
     def line_length(self, percent):
@@ -177,7 +179,7 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
         self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
         self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -194,7 +196,7 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
@@ -225,8 +227,13 @@ class Dehyphenator(object):
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'txt':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'individual_words_txt':
+            intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
+
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
 
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 9bc9323a4c..f6adb617c3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
 import os
 
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
@@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin):
                     log.debug('Could not reliably determine paragraph type using block')
                     options.paragraph_type = 'block'
                 else:
-                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type) 
-            
+                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+            # Get length for hyphen removal and punctuation unwrap
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
+
+            # Dehyphenate
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(txt,'txt', length)
+
             # We don't check for block because the processor assumes block.
             # single and print at transformed to block for processing.
 
@@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin):
 
             if options.paragraph_type == 'unformatted':
                 from calibre.ebooks.conversion.utils import PreProcessor
-                from calibre.ebooks.conversion.preprocess import DocAnalysis
                 # get length
-                docanalysis = DocAnalysis('txt', txt)
-                length = docanalysis.line_length(.5)
+
                 # unwrap lines based on punctuation
                 preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin):
                 html = convert_heuristic(txt, epub_split_size_kb=flow_size)
             else:
                 html = convert_basic(txt, epub_split_size_kb=flow_size)
-            
 
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')

From 696d9252324a5fa31ae91f8a3c5d472b5d5d953c Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 18:14:49 +0800
Subject: [PATCH 5/9] normalized line endings to simplify line length and
 dehyphenation, fixes print formatted output for certain line endings

---
 src/calibre/ebooks/conversion/preprocess.py | 10 +++++-----
 src/calibre/ebooks/txt/input.py             |  8 ++++++--
 src/calibre/ebooks/txt/processor.py         |  5 ++++-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index df9fd66407..d9d735e391 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,8 @@ class DocAnalysis(object):
 
     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
-        raw = raw.replace('\r\n', '\n')
-        raw = raw.replace('\r', '\n')
+        #raw = raw.replace('\r\n', '\n')
+        #raw = raw.replace('\r', '\n')
         if format == 'html':
             linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
         elif format == 'pdf':
@@ -214,10 +214,10 @@ class Dehyphenator(object):
 
         else:
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
-                #print "           returned hyphenated word: " + str(hyphenated)
+                print "           returned hyphenated word: " + str(hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +228,7 @@ class Dehyphenator(object):
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'txt':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
             intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'individual_words_txt':
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index f6adb617c3..2e35e8e345 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic
+    convert_heuristic, normalize_line_endings
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
                 else:
                     log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
+            # Normalize line endings
+            txt = normalize_line_endings(txt)
+
             # Get length for hyphen removal and punctuation unwrap
             docanalysis = DocAnalysis('txt', txt)
             length = docanalysis.line_length(.5)
+            print "length is "+str(length)
 
             # Dehyphenate
             dehyphenator = Dehyphenator()
-            html = dehyphenator(txt,'txt', length)
+            txt = dehyphenator(txt,'txt', length)
 
             # We don't check for block because the processor assumes block.
             # single and print at transformed to block for processing.
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e26f0a9d07..ebdadebda2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
           safe_mode=False)
     return HTML_TEMPLATE % (title, md.convert(txt))
 
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
     txt = txt.replace('\r\n', '\n')
     txt = txt.replace('\r', '\n')
+    return txt
+
+def separate_paragraphs_single_line(txt):
     txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
     return txt
 

From 0f109d699f06967394370150a0a35bf671a283c6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 18:38:52 +0800
Subject: [PATCH 6/9] tweaked the auto-detection to handle cases where the vast
 majority of the lines are formatted as block or print

---
 src/calibre/ebooks/txt/processor.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index ebdadebda2..6a1a106681 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -133,15 +133,21 @@ def detect_paragraph_type(txt):
     hardbreaks = docanalysis.line_histogram(.55)
     
     if hardbreaks:
-        # Check for print
+        # Determine print percentage
         tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
-        if tab_line_count / float(txt_line_count) >= .15:
-            return 'print'
-        
-        # Check for block
+        print_percent = tab_line_count / float(txt_line_count)
+     
+        # Determine block percentage
         empty_line_count = len(re.findall('(?mu)^\s*$', txt))
-        if empty_line_count / float(txt_line_count) >= .15:
-            return 'block'
+        block_percent = empty_line_count / float(txt_line_count)
+        
+        # Compare the two types - the type with the larger share of lines wins,
+        # but only if that share is between 15% and 75%; outside that range neither wins
+        if print_percent >= block_percent:
+            if .15 <= print_percent <= .75:
+                return 'print'
+        elif .15 <= block_percent <= .75:
+            return 'block'     
 
         # Assume unformatted text with hardbreaks if nothing else matches        
         return 'unformatted'

From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 19:34:02 +0800
Subject: [PATCH 7/9] added partial dehyphenation for markdown

---
 src/calibre/ebooks/conversion/preprocess.py | 16 +++++++--------
 src/calibre/ebooks/txt/input.py             | 22 +++++++++++++--------
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d9d735e391..e2c51846a4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -201,15 +201,15 @@ class Dehyphenator(object):
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                print "Cleanup:returned hyphenated word: " + str(hyphenated)
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
@@ -230,12 +230,12 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
-        elif format == 'individual_words_txt':
-            intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
-
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 2e35e8e345..5fbdc7131a 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin):
         # followed by the &nbsp; entity.
         if options.preserve_spaces:
             txt = preserve_spaces(txt)
+
+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Get length for hyphen removal and punctuation unwrap
+        docanalysis = DocAnalysis('txt', txt)
+        length = docanalysis.line_length(.5)
+        print "length is "+str(length)
             
         if options.formatting_type == 'auto':
             options.formatting_type = detect_formatting_type(txt)
@@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
                 else:
                     log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
-            # Normalize line endings
-            txt = normalize_line_endings(txt)
-
-            # Get length for hyphen removal and punctuation unwrap
-            docanalysis = DocAnalysis('txt', txt)
-            length = docanalysis.line_length(.5)
-            print "length is "+str(length)
-
             # Dehyphenate
             dehyphenator = Dehyphenator()
             txt = dehyphenator(txt,'txt', length)
@@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
             else:
                 html = convert_basic(txt, epub_split_size_kb=flow_size)
 
+        # Dehyphenate in cleanup mode for missed txt and markdown conversion
+        print "going through final dehyphenation"
+        dehyphenator = Dehyphenator()
+        html = dehyphenator(html,'txt_cleanup', length)
+        html = dehyphenator(html,'html_cleanup', length)
+
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
         for opt in html_input.options:

From 9751f99db95185a9a6cdf66029f1d46e4a9d90d8 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 19:57:15 +0800
Subject: [PATCH 8/9] cleaned up print statements

---
 src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
 src/calibre/ebooks/txt/input.py             |  2 --
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e2c51846a4..32eee713fe 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -196,28 +196,28 @@ class Dehyphenator(object):
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                 return hyphenated
             else:
-                print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                print "returned dehyphenated word: " + str(dehyphenated)
+                #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
-                print "           returned hyphenated word: " + str(hyphenated)
+                #print "           returned hyphenated word: " + str(hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5fbdc7131a..3957391494 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -80,7 +80,6 @@ class TXTInput(InputFormatPlugin):
         # Get length for hyphen removal and punctuation unwrap
         docanalysis = DocAnalysis('txt', txt)
         length = docanalysis.line_length(.5)
-        print "length is "+str(length)
             
         if options.formatting_type == 'auto':
             options.formatting_type = detect_formatting_type(txt)
@@ -130,7 +129,6 @@ class TXTInput(InputFormatPlugin):
                 html = convert_basic(txt, epub_split_size_kb=flow_size)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        print "going through final dehyphenation"
         dehyphenator = Dehyphenator()
         html = dehyphenator(html,'txt_cleanup', length)
         html = dehyphenator(html,'html_cleanup', length)

From 7008e9b64cbe98ca43e77965a84a3f5af4e88f6d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 21:56:12 +0800
Subject: [PATCH 9/9] removed commented-out line ending normalization from DocAnalysis

---
 src/calibre/ebooks/conversion/preprocess.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 32eee713fe..08a46cb8d9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,6 @@ class DocAnalysis(object):
 
     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
-        #raw = raw.replace('\r\n', '\n')
-        #raw = raw.replace('\r', '\n')
         if format == 'html':
             linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
         elif format == 'pdf':