mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Improved algorithm for removal of hyphens during pre-processing
This commit is contained in:
commit
4dd60160c4
@ -106,6 +106,52 @@ def line_length(format, raw, percent):
|
||||
|
||||
return lengths[index]
|
||||
|
||||
class Dehyphenator(object):
|
||||
'''
|
||||
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
|
||||
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
|
||||
scientific words. The primary disadvantage is that words appearing only once in the document
|
||||
retain hyphens.
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
||||
# remove prefixes if the prefix was not already the point of hyphenation
|
||||
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
|
||||
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
|
||||
|
||||
def dehyphenate(self, match):
|
||||
firsthalf = match.group('firstpart')
|
||||
secondhalf = match.group('secondpart')
|
||||
hyphenated = str(firsthalf) + "-" + str(secondhalf)
|
||||
dehyphenated = str(firsthalf) + str(secondhalf)
|
||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||
if self.prefixes.match(firsthalf) is None:
|
||||
lookupword = self.removeprefix.sub('', lookupword)
|
||||
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
|
||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||
match = booklookup.search(self.html)
|
||||
if match:
|
||||
#print "returned dehyphenated word: " + str(dehyphenated)
|
||||
return dehyphenated
|
||||
else:
|
||||
#print "returned hyphenated word: " + str(hyphenated)
|
||||
return hyphenated
|
||||
|
||||
def __call__(self, html, format, length=1):
|
||||
self.html = html
|
||||
if format == 'html':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
|
||||
elif format == 'pdf':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
|
||||
html = intextmatch.sub(self.dehyphenate, html)
|
||||
return html
|
||||
|
||||
|
||||
class CSSPreProcessor(object):
|
||||
|
||||
@ -328,11 +374,10 @@ class HTMLPreProcessor(object):
|
||||
print 'Failed to parse remove_footer regexp'
|
||||
traceback.print_exc()
|
||||
|
||||
# unwrap hyphenation - moved here so it's executed after header/footer removal
|
||||
# unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
if is_pdftohtml:
|
||||
# unwrap visible dashes and hyphens - don't delete they are often hyphens for
|
||||
# for compound words, formatting, etc
|
||||
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
# unwrap em/en dashes
|
||||
end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
# unwrap/delete soft hyphens
|
||||
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
# unwrap/delete soft hyphens with formatting
|
||||
@ -350,7 +395,7 @@ class HTMLPreProcessor(object):
|
||||
# print "The pdf line length returned is " + str(length)
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
@ -380,6 +425,11 @@ class HTMLPreProcessor(object):
|
||||
for rule in rules + end_rules:
|
||||
html = rule[0].sub(rule[1], html)
|
||||
|
||||
if is_pdftohtml:
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'pdf', length)
|
||||
|
||||
#dump(html, 'post-preprocess')
|
||||
|
||||
# Handle broken XHTML w/ SVG (ugh)
|
||||
|
@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from calibre.ebooks.conversion.preprocess import line_length
|
||||
from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
|
||||
class PreProcessor(object):
|
||||
@ -114,7 +114,7 @@ class PreProcessor(object):
|
||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||
# Get rid of empty span, bold, & italics tags
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
|
||||
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
||||
@ -132,7 +132,6 @@ class PreProcessor(object):
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = re.sub(r"\s*</p>", "</p>\n", html)
|
||||
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
|
||||
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
@ -140,16 +139,16 @@ class PreProcessor(object):
|
||||
#
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
if self.html_preprocess_sections < 10:
|
||||
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
|
||||
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
if self.html_preprocess_sections < 10:
|
||||
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
|
||||
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||
html = chapdetect2.sub(self.chapter_head, html)
|
||||
|
||||
if self.html_preprocess_sections < 10:
|
||||
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
|
||||
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||
html = chapdetect2.sub(self.chapter_head, html)
|
||||
|
||||
###### Unwrap lines ######
|
||||
@ -174,10 +173,16 @@ class PreProcessor(object):
|
||||
length = line_length(format, html, getattr(self.extra_opts,
|
||||
'html_unwrap_factor', 0.4))
|
||||
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
||||
max_length = length * 1.4
|
||||
min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
|
||||
#
|
||||
# Unwrap and/or delete soft-hyphens, hyphens
|
||||
# Unwrap em/en dashes, delete soft-hyphens
|
||||
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
|
||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||
html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'html', length)
|
||||
|
||||
# Unwrap lines using punctation and line length
|
||||
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
@ -186,7 +191,7 @@ class PreProcessor(object):
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < 10:
|
||||
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
# search for places where a first or second level heading is immediately followed by another
|
||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||
|
Loading…
x
Reference in New Issue
Block a user