Heuristic processing: Fix the italicize common patterns algorithm breaking on some HTML markup. Fixes #922317 (Private bug)

This commit is contained in:
Kovid Goyal 2012-01-28 16:43:25 +05:30
parent 565fc2d479
commit bf034f4c5b

View File

@@ -157,7 +157,7 @@ class HeuristicProcessor(object):
ITALICIZE_STYLE_PATS = [
ur'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_',
-ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*>]+)/',
+ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
ur'(?msu)(?<=[\s>"\'])~~(?P<words>[^~]+)~~',
ur'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
ur'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
@@ -172,8 +172,11 @@ class HeuristicProcessor(object):
for word in ITALICIZE_WORDS:
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
+def sub(mo):
+return '<i>%s</i>'%mo.group('words')
for pat in ITALICIZE_STYLE_PATS:
-html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+html = re.sub(pat, sub, html)
return html