From 070df88d209b99ce404822673492d6d34d789772 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 4 Feb 2011 18:06:13 +0800 Subject: [PATCH 1/3] modified blank line handling to better handle lrf formats and preserve scene breaks --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 16ef4c86e2..397146b415 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -28,8 +28,8 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*]*>\s*

){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*(]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) From 80cbd6d89f7836f62572e7664d605fc9a6186c09 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 4 Feb 2011 18:48:37 +0800 Subject: [PATCH 2/3] slightly better method --- src/calibre/ebooks/conversion/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 397146b415..f541701480 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -28,8 +28,8 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*(]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*)?]*>\s*

(\s*)?(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*]*>\s*

(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*

(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) @@ -379,6 +379,8 @@ class HeuristicProcessor(object): html = re.sub('(?i)', '', html) # Re-open self closing paragraph tags html = re.sub('/]*/>', '

', html) + # delete surrounding divs from empty paragraphs + html = re.sub(']*>\s*]*>\s*

\s*', '

', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) @@ -637,6 +639,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) + self.dump(html, 'before_after_merge_blanks') scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks From 84431ab4ddf9d7cd93a8e04d1ce19a14524091a2 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 4 Feb 2011 20:34:07 +0800 Subject: [PATCH 3/3] ... --- src/calibre/ebooks/conversion/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index f541701480..a9f733277a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -379,13 +379,13 @@ class HeuristicProcessor(object): html = re.sub('(?i)', '', html) # Re-open self closing paragraph tags html = re.sub('/]*/>', '

', html) - # delete surrounding divs from empty paragraphs - html = re.sub(']*>\s*]*>\s*

\s*', '

', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) + # delete surrounding divs from empty paragraphs + html = re.sub(']*>\s*]*>\s*

\s*', '

', html) # Empty heading tags html = re.sub(r'(?i)\s*', '', html) self.deleted_nbsps = True @@ -563,7 +563,6 @@ class HeuristicProcessor(object): # Determine whether the document uses interleaved blank lines self.blanks_between_paragraphs = self.analyze_blanks(html) - #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic if getattr(self.extra_opts, 'markup_chapter_headings', False): @@ -639,7 +638,6 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - self.dump(html, 'before_after_merge_blanks') scene_break_regex = self.line_open+'(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) # If the user has enabled scene break replacement, then either softbreaks