From a96c73480d6a014e0b446c5003d773c8c48bb022 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 31 Jan 2011 16:19:47 +0800 Subject: [PATCH] fixed overmatching/substitution issue in italicize function --- src/calibre/ebooks/conversion/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index d0dc81405b..74afbe7a42 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -159,7 +159,7 @@ class HeuristicProcessor(object): ] for word in ITALICIZE_WORDS: - html = re.sub(r'(?<=\s|>)' + word + r'(?=\s|<)', '%s' % word, html) + html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '%s' % word, html) for pat in ITALICIZE_STYLE_PATS: html = re.sub(pat, lambda mo: '%s' % mo.group('words'), html) @@ -375,8 +375,8 @@ class HeuristicProcessor(object): html = re.sub(ur'\s*\s*', ' ', html) # Delete microsoft 'smart' tags html = re.sub('(?i)', '', html) - # Delete self closing paragraph tags - html = re.sub('', '', html) + # Re-open self closing paragraph tags + html = re.sub('/]*/>', '

', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) @@ -463,7 +463,6 @@ class HeuristicProcessor(object): def __call__(self, html): self.log.debug("********* Heuristic processing HTML *********") - # Count the words in the document to estimate how many chapters to look for and whether # other types of processing are attempted try: @@ -477,7 +476,7 @@ class HeuristicProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = self.arrange_htm_line_endings(html) - + self.dump(html, 'after_arrange_line_endings') if self.cleanup_required(): ###### Check Markup ###### # @@ -580,7 +579,9 @@ class HeuristicProcessor(object): if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) # Center separator lines, use a bit larger margin in this case - html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P((?P(?!\s)\W)\s*(?P=breakchar)?)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) + scene_break = re.compile(r'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*()?\s*()?\s*()?\s*', re.IGNORECASE|re.UNICODE) + print "found "+str(len(scene_break.findall(html)))+" scene breaks" + html = scene_break.sub('

' + '\g' + '

', html) #html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: