Heuristics: When italicizing words, do not operate on words that are not between HTML tags. Fixes #986298 (Heuristics breaking id="_anchor_")

This commit is contained in:
Kovid Goyal 2012-04-24 18:23:44 +05:30
commit 5e3e063b49

View File

@ -148,6 +148,7 @@ class HeuristicProcessor(object):
return wordcount.words
def markup_italicis(self, html):
self.log.debug("\n\n\nitalicize debugging \n\n\n")
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
@ -156,27 +157,29 @@ class HeuristicProcessor(object):
]
ITALICIZE_STYLE_PATS = [
ur'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_',
ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
ur'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"\'])~~(?P<words>[^~]+)~~',
ur'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
ur'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
ur'(?msu)(?<=[\s>"\'])_/(?P<words>[^/_]+)/_',
ur'(?msu)(?<=[\s>"\'])_\*(?P<words>[^\*_]+)\*_',
ur'(?msu)(?<=[\s>"\'])\*/(?P<words>[^/\*]+)/\*',
ur'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_',
ur'(?msu)(?<=[\s>"\'])/:(?P<words>[^:/]+):/',
ur'(?msu)(?<=[\s>"\'])\|:(?P<words>[^:\|]+):\|',
ur'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
ur'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
ur'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
ur'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_'
]
for word in ITALICIZE_WORDS:
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
def sub(mo):
return '<i>%s</i>'%mo.group('words')
search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
search_text = re.sub(r'<[^>]*>', '', search_text)
for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, sub, html)
for match in re.finditer(pat, search_text):
ital_string = str(match.group('words'))
#self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
return html