From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 19:34:02 +0800 Subject: [PATCH] added partial dehyphenation for markdown --- src/calibre/ebooks/conversion/preprocess.py | 16 +++++++-------- src/calibre/ebooks/txt/input.py | 22 +++++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d9d735e391..e2c51846a4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -201,15 +201,15 @@ class Dehyphenator(object): searchresult = self.html.find(lookupword.lower()) except: return hyphenated - if self.format == 'html_cleanup': + if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: - #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated elif self.html.find(hyphenated) != -1: - #print "Cleanup:returned hyphenated word: " + str(hyphenated) + print "Cleanup:returned hyphenated word: " + str(hyphenated) return hyphenated else: - #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) return firsthalf+u'\u2014'+wraptags+secondhalf else: @@ -230,12 +230,12 @@ class Dehyphenator(object): elif format == 'txt': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet - elif format == 'individual_words_txt': - intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') - + intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + elif format == 'txt_cleanup': + intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + html = intextmatch.sub(self.dehyphenate, html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 2e35e8e345..5fbdc7131a 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin): # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + # Normalize line endings + txt = normalize_line_endings(txt) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + print "length is "+str(length) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - # Normalize line endings - txt = normalize_line_endings(txt) - - # Get length for hyphen removal and punctuation unwrap - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) - print "length is "+str(length) - # Dehyphenate dehyphenator = Dehyphenator() txt = dehyphenator(txt,'txt', length) @@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin): else: html = convert_basic(txt, epub_split_size_kb=flow_size) + # Dehyphenate in cleanup mode for missed txt and markdown conversion + print "going through final dehyphenation" + dehyphenator = Dehyphenator() + html = dehyphenator(html,'txt_cleanup', length) + html = dehyphenator(html,'html_cleanup', length) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: