added partial dehyphenation for markdown

This commit is contained in:
ldolse 2011-01-09 19:34:02 +08:00
parent 0f109d699f
commit c2cef786ce
2 changed files with 22 additions and 16 deletions

View File

@ -201,15 +201,15 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower()) searchresult = self.html.find(lookupword.lower())
except: except:
return hyphenated return hyphenated
if self.format == 'html_cleanup': if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated) print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated return dehyphenated
elif self.html.find(hyphenated) != -1: elif self.html.find(hyphenated) != -1:
#print "Cleanup:returned hyphenated word: " + str(hyphenated) print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated return hyphenated
else: else:
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf return firsthalf+u'\u2014'+wraptags+secondhalf
else: else:
@ -230,12 +230,12 @@ class Dehyphenator(object):
elif format == 'txt': elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt':
intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020*(?P<secondpart>\w+)\b')
elif format == 'html_cleanup': elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html) html = intextmatch.sub(self.dehyphenate, html)
return html return html

View File

@ -74,6 +74,14 @@ class TXTInput(InputFormatPlugin):
if options.preserve_spaces: if options.preserve_spaces:
txt = preserve_spaces(txt) txt = preserve_spaces(txt)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
print "length is "+str(length)
if options.formatting_type == 'auto': if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt) options.formatting_type = detect_formatting_type(txt)
@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
else: else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Normalize line endings
txt = normalize_line_endings(txt)
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
print "length is "+str(length)
# Dehyphenate # Dehyphenate
dehyphenator = Dehyphenator() dehyphenator = Dehyphenator()
txt = dehyphenator(txt,'txt', length) txt = dehyphenator(txt,'txt', length)
@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
else: else:
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)
# Dehyphenate in cleanup mode for missed txt and markdown conversion
print "going through final dehyphenation"
dehyphenator = Dehyphenator()
html = dehyphenator(html,'txt_cleanup', length)
html = dehyphenator(html,'html_cleanup', length)
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')
for opt in html_input.options: for opt in html_input.options: