From 60c50f39442b09872fb5aeb98a3be2bea3f4ec56 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 16 Jan 2011 01:46:04 +0800 Subject: [PATCH 1/2] tied mobi into preprocess --- src/calibre/ebooks/conversion/utils.py | 5 +++-- src/calibre/ebooks/mobi/input.py | 11 ++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 305346d496..9825585cbf 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -236,7 +236,7 @@ class PreProcessor(object): print unicode(self.chapters_with_title)+" chapters with titles" else: html = chapdetect.sub(self.chapter_head, html) - return html + return html recurse_patterns(html, True) html = recurse_patterns(html, False) @@ -322,7 +322,8 @@ class PreProcessor(object): html = re.sub(ur'\s*\s*', ' ', html) # Delete microsoft 'smart' tags html = re.sub('(?i)', '', html) - # Get rid of empty span, bold, & italics tags + # Get rid of empty span, bold, font, & italics tags + html = re.sub(r'\s*]*>\s*\s*', '', html) html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*\s*){0,2}\s*", " ", html) html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 584be71fe4..4f3a087065 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en' import re from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.utils import PreProcessor class MOBIInput(InputFormatPlugin): @@ -40,10 +41,6 @@ class MOBIInput(InputFormatPlugin): return mr.created_opf_path def heuristics(self, options, html): - # search for places where a first or second level heading is immediately followed by another - # top level heading. demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) - html = doubleheading.sub('\g'+'\n'+'', html) - return html - + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) + return preprocessor(html) From d354a085b8e06f3283231a18fecbf2ee775f52bd Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 16 Jan 2011 01:53:49 +0800 Subject: [PATCH 2/2] ... --- src/calibre/ebooks/mobi/input.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 4f3a087065..8188027e01 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en' import re from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.utils import PreProcessor class MOBIInput(InputFormatPlugin): @@ -40,7 +39,3 @@ class MOBIInput(InputFormatPlugin): accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path - def heuristics(self, options, html): - self.options = options - preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html)