mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
0c685dcfe0
commit
be5519221e
@ -3,7 +3,7 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re, htmlentitydefs
|
||||||
|
|
||||||
_ascii_pat = None
|
_ascii_pat = None
|
||||||
|
|
||||||
@ -21,3 +21,32 @@ def clean_ascii_chars(txt, charlist=None):
|
|||||||
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
||||||
return pat.sub('', txt)
|
return pat.sub('', txt)
|
||||||
|
|
||||||
|
##
|
||||||
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
|
#
|
||||||
|
# @param text The HTML (or XML) source text.
|
||||||
|
# @return The plain text, as a Unicode string, if necessary.
|
||||||
|
|
||||||
|
def unescape(text, rm=False, rchar=u''):
    """Replace HTML/XML character references and named entities in *text*.

    Adapted from Fredrik Lundh's recipe
    (http://effbot.org/zone/re-sub.htm#unescape-html).

    Every ``&name;``, ``&#NNN;`` and ``&#xHHH;`` / ``&#XHHH;`` sequence that
    can be resolved is replaced by the character it denotes.

    :param text: The HTML (or XML) source text.
    :param rm: What to do with sequences that can NOT be resolved (unknown
        entity name, malformed or out-of-range number). If false they are
        left untouched; if true they are replaced by *rchar*.
    :param rchar: Replacement used for unresolvable sequences when *rm* is
        true. Defaults to the empty string, i.e. they are removed.
    :return: The text with all resolvable references decoded.
    """
    # Function-local shims so the block works on both Python 2 and Python 3
    # without touching the module-level imports.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # Numeric character reference; the hex marker may be written
                # in either case.
                if ref[2:3] in ("x", "X"):
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            # Named entity. Return right away so that a successfully decoded
            # entity is never discarded when rm is true.
            return to_char(name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError: non-numeric digits or code point out of range.
            # OverflowError: number too large for the character constructor.
            pass
        # Unresolvable: replace by rchar when asked to, otherwise leave as is.
        return rchar if rm else ref

    return re.sub(r"&#?\w+;", fixup, text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user