diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 691aa307d7..6fafbb992e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -245,17 +245,17 @@ class Dehyphenator(object): self.html = html self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'(?!<)(?P\w+)(-|‐)\s*(?P\w+)(?![^<]*?>)') + intextmatch = re.compile(u'(?!<)(?P[^\W\-]+)(-|‐)\s*(?P\w+)(?![^<]*?>)') elif format == 'html_cleanup': - intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') elif format == 'txt_cleanup': - intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)(?P\s+)(?P[\w\d]+)') html = intextmatch.sub(self.dehyphenate, html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index c0c2ee8978..d075390e8e 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -34,6 +34,8 @@ class HeuristicProcessor(object): self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) self.scene_break_open = '

' + self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]' + self.common_in_text_beginnings = u'[\w\'\"“‘‛]' def is_pdftohtml(self, src): return '' in src[:1000] @@ -638,7 +640,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use