From 439b8c0f213d3b27888086b67619198c0722705f Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 7 Jan 2011 03:40:47 +0800 Subject: [PATCH] delete microsoft smart tags during preprocess --- src/calibre/ebooks/conversion/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6d2d123b10..4bb96ac088 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -234,8 +234,11 @@ class PreProcessor(object): self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) + # Get rid of various common microsoft specific tags which can cause issues later # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Delete microsoft 'smart' tags + html = re.sub('(?i)</?st1:\w+>', '', html) # Get rid of empty span, bold, & italics tags html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)