mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #5336 (Text within angle brackets disappears in .mobi books)
This commit is contained in:
parent
eefa1d58c2
commit
a6945bdadb
@@ -399,38 +399,42 @@ def my_unichr(num):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return u'?'
|
return u'?'
|
||||||
|
|
||||||
def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
|
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
||||||
|
result_exceptions={}):
|
||||||
'''
|
'''
|
||||||
@param match: A match object such that '&'+match.group(1)';' is the entity.
|
@param match: A match object such that '&'+match.group(1)';' is the entity.
|
||||||
@param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
|
@param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
|
||||||
@param encoding: The encoding to use to decode numeric entities between 128 and 256.
|
@param encoding: The encoding to use to decode numeric entities between 128 and 256.
|
||||||
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
|
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
|
||||||
'''
|
'''
|
||||||
|
def check(ch):
|
||||||
|
return result_exceptions.get(ch, ch)
|
||||||
|
|
||||||
ent = match.group(1)
|
ent = match.group(1)
|
||||||
if ent in exceptions:
|
if ent in exceptions:
|
||||||
return '&'+ent+';'
|
return '&'+ent+';'
|
||||||
if ent == 'apos':
|
if ent == 'apos':
|
||||||
return "'"
|
return check("'")
|
||||||
if ent == 'hellips':
|
if ent == 'hellips':
|
||||||
ent = 'hellip'
|
ent = 'hellip'
|
||||||
if ent.startswith(u'#x'):
|
if ent.lower().startswith(u'#x'):
|
||||||
num = int(ent[2:], 16)
|
num = int(ent[2:], 16)
|
||||||
if encoding is None or num > 255:
|
if encoding is None or num > 255:
|
||||||
return my_unichr(num)
|
return check(my_unichr(num))
|
||||||
return chr(num).decode(encoding)
|
return check(chr(num).decode(encoding))
|
||||||
if ent.startswith(u'#'):
|
if ent.startswith(u'#'):
|
||||||
try:
|
try:
|
||||||
num = int(ent[1:])
|
num = int(ent[1:])
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return '&'+ent+';'
|
return '&'+ent+';'
|
||||||
if encoding is None or num > 255:
|
if encoding is None or num > 255:
|
||||||
return my_unichr(num)
|
return check(my_unichr(num))
|
||||||
try:
|
try:
|
||||||
return chr(num).decode(encoding)
|
return check(chr(num).decode(encoding))
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
return my_unichr(num)
|
return check(my_unichr(num))
|
||||||
try:
|
try:
|
||||||
return my_unichr(name2codepoint[ent])
|
return check(my_unichr(name2codepoint[ent]))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return '&'+ent+';'
|
return '&'+ent+';'
|
||||||
|
|
||||||
|
@@ -303,7 +303,12 @@ class MobiReader(object):
|
|||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
self.processed_html = pat.sub('', self.processed_html)
|
self.processed_html = pat.sub('', self.processed_html)
|
||||||
e2u = functools.partial(entity_to_unicode,
|
e2u = functools.partial(entity_to_unicode,
|
||||||
exceptions=['lt', 'gt', 'amp', 'apos', 'quot', '#60', '#62'])
|
result_exceptions={
|
||||||
|
'<' : u'&lt;',
|
||||||
|
'>' : u'&gt;',
|
||||||
|
'&' : u'&amp;',
|
||||||
|
'"' : u'&quot;',
|
||||||
|
"'" : u'&apos;'})
|
||||||
self.processed_html = re.sub(r'&(\S+?);', e2u,
|
self.processed_html = re.sub(r'&(\S+?);', e2u,
|
||||||
self.processed_html)
|
self.processed_html)
|
||||||
self.extract_images(processed_records, output_dir)
|
self.extract_images(processed_records, output_dir)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user