From 90878e844b82c12b629a6f81210272c261357d99 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 7 Feb 2011 18:46:30 +0800 Subject: [PATCH 1/2] fix for text based horizontal rules in dehyphenate and scene break markup --- src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------ src/calibre/ebooks/conversion/utils.py | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 691aa307d7..6fafbb992e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -245,17 +245,17 @@ class Dehyphenator(object): self.html = html self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\W\-]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'(?!<)(?P\w+)(-|‐)\s*(?P\w+)(?![^<]*?>)') + intextmatch = re.compile(u'(?!<)(?P[^\W\-]+)(-|‐)\s*(?P\w+)(?![^<]*?>)') elif format == 'html_cleanup': - intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') elif format == 'txt_cleanup': - intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + intextmatch = re.compile(u'(?P[^\W\-]+)(-|‐)(?P\s+)(?P[\w\d]+)') html = intextmatch.sub(self.dehyphenate, html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index c0c2ee8978..6583c258bf 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -34,6 +34,7 @@ class HeuristicProcessor(object): self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) self.scene_break_open = '

' + self.common_in_text_endings = u'[\"\'—’”,\.!\?\…)\w]' def is_pdftohtml(self, src): return '' in src[:1000] @@ -638,7 +639,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_break_regex = self.line_open+'(?!([\w\'\"]|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use From cdcfde662562105ebb8948828d9ddf37e3dccbf1 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 7 Feb 2011 19:11:47 +0800 Subject: [PATCH 2/2] added more line beginnings --- src/calibre/ebooks/conversion/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6583c258bf..d075390e8e 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -34,7 +34,8 @@ class HeuristicProcessor(object): self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) self.scene_break_open = '

' - self.common_in_text_endings = u'[\"\'—’”,\.!\?\…)\w]' + self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]' + self.common_in_text_beginnings = u'[\w\'\"“‘‛]' def is_pdftohtml(self, src): return '' in src[:1000] @@ -639,7 +640,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?!([\w\'\"]|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use