Fix #5336 (Text within angle brackets disappears in .mobi books)

This commit is contained in:
Kovid Goyal 2010-04-22 07:54:21 -06:00
parent eefa1d58c2
commit a6945bdadb
2 changed files with 19 additions and 10 deletions

View File

@@ -399,38 +399,42 @@ def my_unichr(num):
except ValueError: except ValueError:
return u'?' return u'?'
def entity_to_unicode(match, exceptions=[], encoding='cp1252'): def entity_to_unicode(match, exceptions=[], encoding='cp1252',
result_exceptions={}):
''' '''
@param match: A match object such that '&'+match.group(1)';' is the entity. @param match: A match object such that '&'+match.group(1)';' is the entity.
@param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234' @param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
@param encoding: The encoding to use to decode numeric entities between 128 and 256. @param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252. If None, the Unicode UCS encoding is used. A common encoding is cp1252.
''' '''
def check(ch):
return result_exceptions.get(ch, ch)
ent = match.group(1) ent = match.group(1)
if ent in exceptions: if ent in exceptions:
return '&'+ent+';' return '&'+ent+';'
if ent == 'apos': if ent == 'apos':
return "'" return check("'")
if ent == 'hellips': if ent == 'hellips':
ent = 'hellip' ent = 'hellip'
if ent.startswith(u'#x'): if ent.lower().startswith(u'#x'):
num = int(ent[2:], 16) num = int(ent[2:], 16)
if encoding is None or num > 255: if encoding is None or num > 255:
return my_unichr(num) return check(my_unichr(num))
return chr(num).decode(encoding) return check(chr(num).decode(encoding))
if ent.startswith(u'#'): if ent.startswith(u'#'):
try: try:
num = int(ent[1:]) num = int(ent[1:])
except ValueError: except ValueError:
return '&'+ent+';' return '&'+ent+';'
if encoding is None or num > 255: if encoding is None or num > 255:
return my_unichr(num) return check(my_unichr(num))
try: try:
return chr(num).decode(encoding) return check(chr(num).decode(encoding))
except UnicodeDecodeError: except UnicodeDecodeError:
return my_unichr(num) return check(my_unichr(num))
try: try:
return my_unichr(name2codepoint[ent]) return check(my_unichr(name2codepoint[ent]))
except KeyError: except KeyError:
return '&'+ent+';' return '&'+ent+';'

View File

@@ -303,7 +303,12 @@ class MobiReader(object):
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html) self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode, e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot', '#60', '#62']) result_exceptions={
'<' : u'&lt;',
'>' : u'&gt;',
'&' : u'&amp;',
'"' : u'&quot;',
"'" : u'&apos;'})
self.processed_html = re.sub(r'&(\S+?);', e2u, self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html) self.processed_html)
self.extract_images(processed_records, output_dir) self.extract_images(processed_records, output_dir)