From 80cbd6d89f7836f62572e7664d605fc9a6186c09 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 4 Feb 2011 18:48:37 +0800 Subject: [PATCH] slightly better method --- src/calibre/ebooks/conversion/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 397146b415..f541701480 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -28,8 +28,8 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*(]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*]*>\s*

(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*

(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) @@ -379,6 +379,8 @@ class HeuristicProcessor(object): html = re.sub('(?i)', '', html) # Re-open self closing paragraph tags html = re.sub('/]*/>', '

', html) + # delete surrounding divs from empty paragraphs + html = re.sub(']*>\s*]*>\s*

\s*', '

', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) @@ -637,6 +639,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) + self.dump(html, 'before_after_merge_blanks') scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks