mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
Do not choke when reading metadata from MOBI files with incorrectly encoded metadata fields
This commit is contained in:
parent
67e691c2ff
commit
70a22e0327
@ -13,6 +13,7 @@ from calibre.utils.date import parse_date
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||
from calibre.ebooks.mobi.langcodes import main_language, sub_language, mobi2iana
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
|
||||
NULL_INDEX = 0xffffffff
|
||||
@ -31,6 +32,8 @@ class EXTHHeader(object): # {{{
|
||||
self.kf8_header = None
|
||||
self.uuid = self.cdetype = None
|
||||
|
||||
self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))
|
||||
|
||||
while left > 0:
|
||||
left -= 1
|
||||
idx, size = struct.unpack('>LL', raw[pos:pos + 8])
|
||||
@ -66,7 +69,7 @@ class EXTHHeader(object): # {{{
|
||||
# title contains non ASCII chars or non filename safe chars
|
||||
# they are messed up in the PDB header
|
||||
try:
|
||||
title = content.decode(codec)
|
||||
title = self.decode(content)
|
||||
except:
|
||||
pass
|
||||
elif idx == 524: # Lang code
|
||||
@ -80,31 +83,30 @@ class EXTHHeader(object): # {{{
|
||||
#else:
|
||||
# print 'unknown record', idx, repr(content)
|
||||
if title:
|
||||
self.mi.title = replace_entities(title)
|
||||
self.mi.title = replace_entities(clean_ascii_chars(title))
|
||||
|
||||
def process_metadata(self, idx, content, codec):
|
||||
if idx == 100:
|
||||
if self.mi.is_null('authors'):
|
||||
self.mi.authors = []
|
||||
au = content.decode(codec, 'ignore').strip()
|
||||
au = self.decode(content).strip()
|
||||
self.mi.authors.append(au)
|
||||
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
|
||||
self.mi.author_sort = au.strip()
|
||||
elif idx == 101:
|
||||
self.mi.publisher = content.decode(codec, 'ignore').strip()
|
||||
self.mi.publisher = self.decode(content).strip()
|
||||
if self.mi.publisher in {'Unknown', _('Unknown')}:
|
||||
self.mi.publisher = None
|
||||
elif idx == 103:
|
||||
self.mi.comments = content.decode(codec, 'ignore')
|
||||
self.mi.comments = self.decode(content).strip()
|
||||
elif idx == 104:
|
||||
raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
|
||||
raw = check_isbn(self.decode(content).strip().replace('-', ''))
|
||||
if raw:
|
||||
self.mi.isbn = raw
|
||||
elif idx == 105:
|
||||
if not self.mi.tags:
|
||||
self.mi.tags = []
|
||||
self.mi.tags.extend([x.strip() for x in content.decode(codec,
|
||||
'ignore').split(';')])
|
||||
self.mi.tags.extend([x.strip() for x in self.decode(content).split(';')])
|
||||
self.mi.tags = list(set(self.mi.tags))
|
||||
elif idx == 106:
|
||||
try:
|
||||
@ -112,7 +114,7 @@ class EXTHHeader(object): # {{{
|
||||
except:
|
||||
pass
|
||||
elif idx == 108:
|
||||
self.mi.book_producer = content.decode(codec, 'ignore').strip()
|
||||
self.mi.book_producer = self.decode(content).strip()
|
||||
elif idx == 112: # dc:source set in some EBSP amazon samples
|
||||
try:
|
||||
content = content.decode(codec).strip()
|
||||
|
Loading…
x
Reference in New Issue
Block a user