mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
added partial dehyphenation for markdown
This commit is contained in:
parent
0f109d699f
commit
c2cef786ce
@ -201,15 +201,15 @@ class Dehyphenator(object):
|
||||
searchresult = self.html.find(lookupword.lower())
|
||||
except:
|
||||
return hyphenated
|
||||
if self.format == 'html_cleanup':
|
||||
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||
print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
||||
return dehyphenated
|
||||
elif self.html.find(hyphenated) != -1:
|
||||
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
||||
print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
||||
return hyphenated
|
||||
else:
|
||||
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
||||
print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||
|
||||
else:
|
||||
@ -230,12 +230,12 @@ class Dehyphenator(object):
|
||||
elif format == 'txt':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
elif format == 'individual_words_txt':
|
||||
intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
|
||||
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
elif format == 'html_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||
elif format == 'txt_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
|
||||
|
||||
|
||||
html = intextmatch.sub(self.dehyphenate, html)
|
||||
return html
|
||||
|
@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin):
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
print "length is "+str(length)
|
||||
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
print "length is "+str(length)
|
||||
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
|
||||
else:
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
||||
print "going through final dehyphenation"
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'txt_cleanup', length)
|
||||
html = dehyphenator(html,'html_cleanup', length)
|
||||
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
|
Loading…
x
Reference in New Issue
Block a user