diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6583c258bf..d075390e8e 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -34,7 +34,8 @@ class HeuristicProcessor(object): self.line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>" self.single_blank = re.compile(r'(\s*
]*>\s*
)', re.IGNORECASE) self.scene_break_open = ''
- self.common_in_text_endings = u'[\"\'—’”,\.!\?\…)\w]'
+ self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]'
+ self.common_in_text_beginnings = u'[\w\'\"“‘‛]'
def is_pdftohtml(self, src):
    """Return True if the pdftohtml marker comment appears near the top of *src*.

    Only the first 1000 characters of ``src`` are inspected, so a marker
    that occurs later in the document is deliberately not detected.

    :param src: markup text to inspect.
    :return: ``True`` when the marker is found in ``src[:1000]``, else
        ``False``.
    """
    # The check previously read `'' in src[:1000]`. An empty string is a
    # substring of every string, so that test was unconditionally True.
    # NOTE(review): the marker literal below is reconstructed (the text
    # between the quotes had been stripped as if it were an HTML tag) --
    # confirm it matches the comment the pdftohtml step actually emits.
    marker = "<!-- created by calibre's pdftohtml -->"
    return marker in src[:1000]
@@ -639,7 +640,7 @@ class HeuristicProcessor(object):
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
- scene_break_regex = self.line_open+'(?!([\w\'\"]|.*?'+self.common_in_text_endings+'<))(?P