diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 03a0047927..a1e28b2554 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -106,6 +106,50 @@ def line_length(format, raw, percent): return lengths[index] +class Dehyphenator(object): + ''' + Analyzes words to determine whether hyphens should be retained/removed. Uses the document + itself is as a dictionary. This method handles all languages along with uncommon, made-up, and + scientific words. The primary disadvantage is that words appearing only once in the document + retain hyphens. + ''' + + def dehyphenate(self, match): + firsthalf = match.group('firstpart') + secondhalf = match.group('secondpart') + hyphenated = str(firsthalf) + "-" + str(secondhalf) + dehyphenated = str(firsthalf) + str(secondhalf) + # Add common suffixes to the regex below to increase the likelihood of a match - + # don't add suffixes which are also complete words, such as 'able' or 'sex' + removesuffixes = re.compile(r"((ed)?ly|(')?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + lookupword = removesuffixes.sub('', dehyphenated) + # remove prefixes if the prefix was not already the point of hyphenation + prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) + removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) + if prefixes.match(firsthalf) is None: + lookupword = removeprefix.sub('', lookupword) + booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) + #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + match = booklookup.search(self.html) + if match: + #print "returned dehyphenated word: " + str(dehyphenated) + return dehyphenated + else: + #print "returned hyphenated word: " + str(hyphenated) + return hyphenated + + def __call__(self, html, format, length=1): + self.html = html + if format == 'html': + intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) + elif format == 'pdf': + intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'individual_words': + intextmatch = re.compile('>[^<]*\b(?P[^"\s>]+)-(?P\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap em/en dashes + end_rules.append((re.compile(u'(?<=[–—])\s*

\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens end_rules.append((re.compile(u'[­](\s*

)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting @@ -350,7 +393,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -380,6 +423,11 @@ class HTMLPreProcessor(object): for rule in rules + end_rules: html = rule[0].sub(rule[1], html) + if is_pdftohtml: + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(html,'pdf', length) + #dump(html, 'post-preprocess') # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 37fd169cb1..f9178ead0b 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' import re -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator from calibre.utils.logging import default_log class PreProcessor(object): @@ -132,7 +132,6 @@ class PreProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*

\s*", "\n

", html) - #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -174,10 +173,16 @@ class PreProcessor(object): length = line_length(format, html, getattr(self.extra_opts, 'html_unwrap_factor', 0.4)) self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") + max_length = length * 1.4 + min_max = str("(?<=.{"+str(length)+"})(?\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html) + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(html,'html', length) # Unwrap lines using punctation and line length unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)