mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Enabled hyphen removal across the entire document text, refactored the logic to reduce false positives, and added verbose debug output.
This commit is contained in:
parent
66e1b8a27e
commit
1272988089
@ -174,13 +174,19 @@ class Dehyphenator(object):
|
|||||||
retain hyphens.
|
retain hyphens.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, verbose=0, log=None):
|
||||||
|
self.log = default_log if log is None else log
|
||||||
|
self.verbose = verbose
|
||||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
|
# only remove if it's not already the point of hyphenation
|
||||||
|
self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
|
||||||
|
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
|
||||||
|
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
|
||||||
# remove prefixes if the prefix was not already the point of hyphenation
|
# remove prefixes if the prefix was not already the point of hyphenation
|
||||||
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
|
self.prefix_string = '^(dis|re|un|in|ex)'
|
||||||
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
|
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
|
||||||
|
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
|
||||||
|
|
||||||
def dehyphenate(self, match):
|
def dehyphenate(self, match):
|
||||||
firsthalf = match.group('firstpart')
|
firsthalf = match.group('firstpart')
|
||||||
@ -191,31 +197,44 @@ class Dehyphenator(object):
|
|||||||
wraptags = ''
|
wraptags = ''
|
||||||
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
|
hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
|
||||||
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
||||||
|
if self.suffixes.match(secondhalf) is None:
|
||||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||||
if self.prefixes.match(firsthalf) is None:
|
else:
|
||||||
|
lookupword = dehyphenated
|
||||||
|
if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
|
||||||
lookupword = self.removeprefix.sub('', lookupword)
|
lookupword = self.removeprefix.sub('', lookupword)
|
||||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
|
||||||
try:
|
try:
|
||||||
searchresult = self.html.find(lookupword.lower())
|
searchresult = self.html.find(lookupword.lower())
|
||||||
except:
|
except:
|
||||||
return hyphenated
|
return hyphenated
|
||||||
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
elif self.html.find(hyphenated) != -1:
|
elif self.html.find(hyphenated) != -1:
|
||||||
#print "Cleanup:returned hyphenated word: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
|
||||||
return hyphenated
|
return hyphenated
|
||||||
else:
|
else:
|
||||||
#print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
|
if self.verbose > 2:
|
||||||
|
self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
|
||||||
return firsthalf+u'\u2014'+wraptags+secondhalf
|
return firsthalf+u'\u2014'+wraptags+secondhalf
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
|
||||||
|
if self.verbose > 2:
|
||||||
|
self.log("too short, returned hyphenated word: " + str(hyphenated))
|
||||||
|
return hyphenated
|
||||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" returned dehyphenated word: " + str(dehyphenated))
|
||||||
return dehyphenated
|
return dehyphenated
|
||||||
else:
|
else:
|
||||||
#print " returned hyphenated word: " + str(hyphenated)
|
if self.verbose > 2:
|
||||||
|
self.log(" returned hyphenated word: " + str(hyphenated))
|
||||||
return hyphenated
|
return hyphenated
|
||||||
|
|
||||||
def __call__(self, html, format, length=1):
|
def __call__(self, html, format, length=1):
|
||||||
@ -228,7 +247,7 @@ class Dehyphenator(object):
|
|||||||
elif format == 'txt':
|
elif format == 'txt':
|
||||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
|
||||||
elif format == 'individual_words':
|
elif format == 'individual_words':
|
||||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
|
||||||
elif format == 'html_cleanup':
|
elif format == 'html_cleanup':
|
||||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||||
elif format == 'txt_cleanup':
|
elif format == 'txt_cleanup':
|
||||||
@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
if is_pdftohtml and length > -1:
|
if is_pdftohtml and length > -1:
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
|
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
|
@ -322,11 +322,11 @@ class HeuristicProcessor(object):
|
|||||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
# Delete microsoft 'smart' tags
|
# Delete microsoft 'smart' tags
|
||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
# Get rid of empty span, bold, font, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
|
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
self.deleted_nbsps = True
|
self.deleted_nbsps = True
|
||||||
return html
|
return html
|
||||||
|
|
||||||
@ -376,17 +376,20 @@ class HeuristicProcessor(object):
|
|||||||
except:
|
except:
|
||||||
self.log("Can't get wordcount")
|
self.log("Can't get wordcount")
|
||||||
|
|
||||||
if 0 < self.totalwords < 50:
|
print "found "+unicode(self.totalwords)+" words in the flow"
|
||||||
|
if self.totalwords < 50:
|
||||||
self.log("flow is too short, not running heuristics")
|
self.log("flow is too short, not running heuristics")
|
||||||
return html
|
return html
|
||||||
|
|
||||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||||
html = self.arrange_htm_line_endings(html)
|
html = self.arrange_htm_line_endings(html)
|
||||||
|
|
||||||
|
if self.cleanup_required():
|
||||||
###### Check Markup ######
|
###### Check Markup ######
|
||||||
#
|
#
|
||||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||||
# <pre> tags), check and mark up line endings if required before proceeding
|
# <pre> tags), check and mark up line endings if required before proceeding
|
||||||
|
# fix indents must run after this step
|
||||||
if self.no_markup(html, 0.1):
|
if self.no_markup(html, 0.1):
|
||||||
self.log("not enough paragraph markers, adding now")
|
self.log("not enough paragraph markers, adding now")
|
||||||
# markup using text processing
|
# markup using text processing
|
||||||
@ -397,6 +400,7 @@ class HeuristicProcessor(object):
|
|||||||
html = self.fix_nbsp_indents(html)
|
html = self.fix_nbsp_indents(html)
|
||||||
|
|
||||||
if self.cleanup_required():
|
if self.cleanup_required():
|
||||||
|
# fix indents must run before this step, as it removes non-breaking spaces
|
||||||
html = self.cleanup_markup(html)
|
html = self.cleanup_markup(html)
|
||||||
|
|
||||||
# ADE doesn't render <br />, change to empty paragraphs
|
# ADE doesn't render <br />, change to empty paragraphs
|
||||||
@ -421,8 +425,6 @@ class HeuristicProcessor(object):
|
|||||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
###### Unwrap lines ######
|
|
||||||
if getattr(self.extra_opts, 'unwrap_lines', False):
|
|
||||||
# Determine line ending type
|
# Determine line ending type
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||||
@ -440,6 +442,8 @@ class HeuristicProcessor(object):
|
|||||||
length = docanalysis.line_length(unwrap_factor)
|
length = docanalysis.line_length(unwrap_factor)
|
||||||
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
||||||
|
|
||||||
|
###### Unwrap lines ######
|
||||||
|
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||||
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
||||||
if hardbreaks or unwrap_factor < 0.4:
|
if hardbreaks or unwrap_factor < 0.4:
|
||||||
self.log("Unwrapping required, unwrapping Lines")
|
self.log("Unwrapping required, unwrapping Lines")
|
||||||
@ -447,15 +451,16 @@ class HeuristicProcessor(object):
|
|||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
html = self.punctuation_unwrap(length, html, 'html')
|
html = self.punctuation_unwrap(length, html, 'html')
|
||||||
#check any remaining hyphens, but only unwrap if there is a match
|
# unwrap remaining hyphens based on line length, but only remove if there is a match
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'dehyphenate', False):
|
if getattr(self.extra_opts, 'dehyphenate', False):
|
||||||
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
|
||||||
self.log("Fixing hyphenated content")
|
self.log("Fixing hyphenated content")
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
html = dehyphenator(html, 'individual_words', length)
|
||||||
|
|
||||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||||
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
|
@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||||
|
|
||||||
# Dehyphenate
|
# Dehyphenate
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
|
||||||
txt = dehyphenator(txt,'txt', length)
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
setattr(options, 'dehyphenate', True)
|
setattr(options, 'dehyphenate', True)
|
||||||
|
|
||||||
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
|
||||||
html = dehyphenator(html,'txt_cleanup', length)
|
html = dehyphenator(html,'txt_cleanup', length)
|
||||||
html = dehyphenator(html,'html_cleanup', length)
|
html = dehyphenator(html,'html_cleanup', length)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user