diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e87a8021f9..3b1239814a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -106,6 +106,52 @@ def line_length(format, raw, percent):
return lengths[index]
+class Dehyphenator(object):
+    '''
+    Analyzes words to determine whether hyphens should be retained/removed. Uses the
+    document itself as a dictionary. This method handles all languages along with uncommon,
+    made-up, and scientific words. The primary disadvantage is that words appearing only
+    once in the document retain hyphens.
+    '''
+
+    def __init__(self):
+        # Add common suffixes to the regex below to increase the likelihood of a match -
+        # don't add suffixes which are also complete words, such as 'able' or 'sex'
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of hyphenation
+        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+
+    def dehyphenate(self, match):
+        firsthalf = match.group('firstpart')
+        secondhalf = match.group('secondpart')
+        hyphenated = str(firsthalf) + "-" + str(secondhalf)
+        dehyphenated = str(firsthalf) + str(secondhalf)
+        lookupword = self.removesuffixes.sub('', dehyphenated)
+        if self.prefixes.match(firsthalf) is None:
+            lookupword = self.removeprefix.sub('', lookupword)
+        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        match = booklookup.search(self.html)
+        if match:
+            #print "returned dehyphenated word: " + str(dehyphenated)
+            return dehyphenated
+        else:
+            #print "returned hyphenated word: " + str(hyphenated)
+            return hyphenated
+
+    def __call__(self, html, format, length=1):
+        self.html = html
+        if format == 'html':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+        elif format == 'pdf':
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+        elif format == 'individual_words':
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')  # for later, not called anywhere yet
+        html = intextmatch.sub(self.dehyphenate, html)
+        return html
+
@@ ... @@ class HTMLPreProcessor(object):
-            end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap em/en dashes
+            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
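
A minimal standalone sketch of the document-as-dictionary lookup that
Dehyphenator.dehyphenate performs (the names SUFFIXES and rejoin are
illustrative, and the suffix list is trimmed down from removesuffixes):

    import re

    # trimmed-down stand-in for self.removesuffixes
    SUFFIXES = re.compile(r"(ly|ings?|ed|s)$", re.IGNORECASE)

    def rejoin(match, document):
        first, second = match.group('firstpart'), match.group('secondpart')
        candidate = first + second
        # strip a common suffix so inflected forms elsewhere count as evidence
        lookup = SUFFIXES.sub('', candidate)
        # drop the hyphen only if the joined form occurs elsewhere in the document
        if re.search(re.escape(lookup), document, re.IGNORECASE):
            return candidate
        return first + '-' + second

    doc = 'The fashion of the day... a fash- ion that lasted.'
    pattern = re.compile(r'(?P<firstpart>\w+)-\s+(?P<secondpart>\w+)')
    print(pattern.sub(lambda m: rejoin(m, doc), doc))
    # -> The fashion of the day... a fashion that lasted.

Unlike the patch, this sketch escapes the lookup word before searching; the
class above interpolates the word into a regex directly.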
@@ -350,7 +395,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?(i|b|u)>)?\s*(
\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -380,6 +425,11 @@ class HTMLPreProcessor(object):
         for rule in rules + end_rules:
             html = rule[0].sub(rule[1], html)
+        if is_pdftohtml:
+            # Dehyphenate
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'pdf', length)
+
#dump(html, 'post-preprocess')
# Handle broken XHTML w/ SVG (ugh)
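
The length argument threaded through the new pdf branch above acts as a guard:
the %i interpolation builds a fixed-width lookbehind, so candidate hyphens are
only considered once at least `length` characters precede them on the line. A
sketch of just that mechanism, with an illustrative plain-text rule in place of
the HTML-aware ones in the patch:

    import re

    length = 20
    # same shape as the '(?<=.{%i})' guard used by Dehyphenator.__call__
    rule = re.compile(r'(?<=.{%i})-\s*\n\s*' % length)

    short = 'end-\nings'  # fewer than 20 chars before the hyphen: kept
    long_ = 'a line well past twenty characters end-\nings'
    print(rule.sub('', short))   # unchanged
    print(rule.sub('', long_))   # '...endings' rejoined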
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 37fd169cb1..6a5eaa4a34 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
from calibre.utils.logging import default_log
class PreProcessor(object):
@@ -114,7 +114,7 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
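
The [^>]* added in this hunk lets the cleanup also match bold/italic/underline
tags that carry attributes, such as the class attributes calibre itself
inserts. An illustrative check:

    import re

    cleanup = re.compile(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>")
    print(cleanup.sub(' ', 'x<i class="calibre3"> <b></b> </i>y'))  # -> 'x y'
    # the previous pattern, r"\s*<[ibu]>...", left attributed tags untouched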
@@ -132,7 +132,6 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -140,16 +139,16 @@ class PreProcessor(object):
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
###### Unwrap lines ######
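
The three detection passes above escalate only while too few sections have been
found. A rough, runnable sketch of that strategy (the inline patterns are
simplified stand-ins for chapdetect/chapdetect2, not the real expressions):

    import re

    def mark_chapter(match):
        return '<h2>%s</h2>' % match.group('chap')

    html = '<p>CHAPTER 1</p><p>It begins...</p><p>CHAPTER 2</p><p>More...</p>'
    passes = [
        re.compile(r'<p>(?P<chap>chapter\s+\d+)</p>', re.IGNORECASE),  # typical headings
        re.compile(r'<p>(?P<chap>\d+\.?)</p>'),                        # bare numbers
        re.compile(r'<p>(?P<chap>[A-Z][A-Z\s\d]+)</p>'),               # uppercase lines
    ]
    for chapdetect in passes:
        sections = len(re.findall(r'<h[1-3][^>]*>', html, re.IGNORECASE))
        if sections >= 10:  # enough sections found; stop escalating
            break
        html = chapdetect.sub(mark_chapter, html)
    print(html)
    # -> <h2>CHAPTER 1</h2><p>It begins...</p><h2>CHAPTER 2</h2><p>More...</p>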
@@ -174,10 +173,16 @@ class PreProcessor(object):
length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4))
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
+        max_length = length * 1.4
+        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
         # Unwrap and/or delete soft-hyphens
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
+        # Dehyphenate
+        dehyphenator = Dehyphenator()
+        html = dehyphenator(html,'html', length)
         # Unwrap lines using punctuation and line length
         unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@@ -186,7 +191,7 @@ class PreProcessor(object):
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
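
The min_max prefix introduced in this file bounds unwrapping from both sides:
the positive lookbehind requires at least `length` characters before a break,
and the negative lookbehind rejects breaks with `max_length` or more characters
before them, so only lines near the median length are rejoined. A sketch of the
assembled window, using a bare newline in place of the markup the real rule
matches (whole numbers are used here because a lookbehind quantifier must have
a fixed integer width, while the patch computes max_length as a float):

    import re

    length, max_length = 10, 14
    min_max = "(?<=.{" + str(length) + "})(?<!.{" + str(max_length) + "})"
    unwrap = re.compile(min_max + r'\n')

    text = '1234567890ab\nunwrapped\ntiny\nkept'
    print(repr(unwrap.sub(' ', text)))
    # -> '1234567890ab unwrapped\ntiny\nkept'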