mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Made em-dash unwrapping line-length dependent, as the em-dash is sometimes used as an ellipsis alternative
This commit is contained in:
parent
e303babf89
commit
301af532c6
@ -117,7 +117,7 @@ class Dehyphenator(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||||
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
|
||||||
# remove prefixes if the prefix was not already the point of hyphenation
|
# remove prefixes if the prefix was not already the point of hyphenation
|
||||||
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
|
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
|
||||||
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
|
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
|
||||||
@ -374,10 +374,8 @@ class HTMLPreProcessor(object):
|
|||||||
print 'Failed to parse remove_footer regexp'
|
print 'Failed to parse remove_footer regexp'
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
# unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
|
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
# unwrap em/en dashes
|
|
||||||
end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
|
||||||
# unwrap/delete soft hyphens
|
# unwrap/delete soft hyphens
|
||||||
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
# unwrap/delete soft hyphens with formatting
|
# unwrap/delete soft hyphens with formatting
|
||||||
@ -397,6 +395,8 @@ class HTMLPreProcessor(object):
|
|||||||
# Un wrap using punctuation
|
# Un wrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
# unwrap em/en dashes
|
||||||
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user