mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Attempt to eliminate the general problem of the italicize patterns matching text inside tags, headers, etc.
This commit is contained in:
parent
b717749138
commit
b32e608576
@ -148,6 +148,7 @@ class HeuristicProcessor(object):
|
|||||||
return wordcount.words
|
return wordcount.words
|
||||||
|
|
||||||
def markup_italicis(self, html):
|
def markup_italicis(self, html):
|
||||||
|
self.log.debug("\n\n\nitalicize debugging \n\n\n")
|
||||||
ITALICIZE_WORDS = [
|
ITALICIZE_WORDS = [
|
||||||
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
|
||||||
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
|
||||||
@ -156,27 +157,29 @@ class HeuristicProcessor(object):
|
|||||||
]
|
]
|
||||||
|
|
||||||
ITALICIZE_STYLE_PATS = [
|
ITALICIZE_STYLE_PATS = [
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_',
|
ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
|
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
|
ur'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
|
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
|
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
|
ur'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
|
ur'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
|
ur'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
|
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
|
ur'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
|
||||||
ur'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
|
ur'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
|
||||||
|
ur'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
|
||||||
|
ur'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
|
||||||
|
ur'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
|
||||||
|
ur'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'
|
||||||
]
|
]
|
||||||
|
|
||||||
for word in ITALICIZE_WORDS:
|
for word in ITALICIZE_WORDS:
|
||||||
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
|
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
|
||||||
|
|
||||||
def sub(mo):
|
search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
|
||||||
return '<i>%s</i>'%mo.group('words')
|
search_text = re.sub(r'<[^>]*>', '', search_text)
|
||||||
|
|
||||||
for pat in ITALICIZE_STYLE_PATS:
|
for pat in ITALICIZE_STYLE_PATS:
|
||||||
html = re.sub(pat, sub, html)
|
for match in re.finditer(pat, search_text):
|
||||||
|
ital_string = str(match.group('words'))
|
||||||
|
#self.log.debug("italicising "+str(match.group(0))+" with <i>"+ital_string+"</i>")
|
||||||
|
html = re.sub(re.escape(str(match.group(0))), '<i>%s</i>' % ital_string, html)
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user