From a96c73480d6a014e0b446c5003d773c8c48bb022 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 31 Jan 2011 16:19:47 +0800
Subject: [PATCH] fixed overmatching/substitution issue in italicize function
---
src/calibre/ebooks/conversion/utils.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d0dc81405b..74afbe7a42 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -159,7 +159,7 @@ class HeuristicProcessor(object):
]
for word in ITALICIZE_WORDS:
- html = re.sub(r'(?<=\s|>)' + word + r'(?=\s|<)', '%s' % word, html)
+ html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '%s' % word, html)
for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, lambda mo: '%s' % mo.group('words'), html)
@@ -375,8 +375,8 @@ class HeuristicProcessor(object):
html = re.sub(ur'\s*\s*', ' ', html)
# Delete microsoft 'smart' tags
html = re.sub('(?i)?st1:\w+>', '', html)
- # Delete self closing paragraph tags
- html = re.sub('', '', html)
+ # Re-open self closing paragraph tags
+ html = re.sub('/]*/>', '
', html)
# Get rid of empty span, bold, font, em, & italics tags
html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html)
html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html)
@@ -463,7 +463,6 @@ class HeuristicProcessor(object):
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
-
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
try:
@@ -477,7 +476,7 @@ class HeuristicProcessor(object):
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
-
+ self.dump(html, 'after_arrange_line_endings')
if self.cleanup_required():
###### Check Markup ######
#
@@ -580,7 +579,9 @@ class HeuristicProcessor(object):
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
# Center separator lines, use a bit larger margin in this case
- html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P((?P(?!\s)\W)\s*(?P=breakchar)?)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
+ scene_break = re.compile(r'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', re.IGNORECASE|re.UNICODE)
+ print "found "+str(len(scene_break.findall(html)))+" scene breaks"
+ html = scene_break.sub('' + '\g' + '
', html)
#html = re.sub(']*>\s*
', '
', html)
if self.deleted_nbsps: