diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index acfa80e877..2c1a5cd4d3 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -148,6 +148,7 @@ class HeuristicProcessor(object): return wordcount.words def markup_italicis(self, html): + self.log.debug("\n\n\nitalicize debugging \n\n\n") ITALICIZE_WORDS = [ 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.', @@ -156,28 +157,30 @@ class HeuristicProcessor(object): ] ITALICIZE_STYLE_PATS = [ - ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_', - ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/', + ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_', ur'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~', - ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*', - ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~', ur'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_', ur'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_', ur'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*', - ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_', ur'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/', ur'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|', + ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*', + ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~', + ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/', + ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_' ] for word in ITALICIZE_WORDS: html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html) - def sub(mo): - return '<i>%s</i>'%mo.group('words') - + search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html) + search_text = re.sub(r'<[^>]*>', '', search_text) for pat in ITALICIZE_STYLE_PATS: - html = re.sub(pat, sub, html) - + for match in re.finditer(pat, search_text): + ital_string = str(match.group('words')) + #self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>") + html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html) + return html def markup_chapters(self, html, wordcount, blanks_between_paragraphs):