Improve title, author and subject detection for mobi files

This commit is contained in:
Kovid Goyal 2008-06-20 14:23:56 -07:00
parent e49db6236b
commit 520328c196

View File

@@ -44,27 +44,29 @@ class EXTHHeader(object):
self.cover_offset, = struct.unpack('>L', content) self.cover_offset, = struct.unpack('>L', content)
elif id == 202: elif id == 202:
self.thumbnail_offset, = struct.unpack('>L', content) self.thumbnail_offset, = struct.unpack('>L', content)
pos += 3 title = re.search(r'\0+([^\0]+)\0+', raw[pos:])
stop = raw[pos:].find('\x00') if title:
if stop > -1: self.mi.title = title.group(1).decode(codec, 'ignore')
self.mi.title = raw[pos:pos+stop].decode(codec, 'ignore')
def process_metadata(self, id, content, codec): def process_metadata(self, id, content, codec):
if id == 100: if id == 100:
aus = content.split(',') aus = content.split(',')
authors = [] if len(aus) > 0:
for a in aus: self.mi.author_sort = aus[0].decode(codec, 'ignore').strip()
authors.extend(a.split('&')) self.mi.authors = [aus[1].decode(codec, 'ignore').strip()]
self.mi.authors = [i.decode(codec, 'ignore') for i in authors] else:
self.mi.authors = [aus[0].decode(codec, 'ignore').strip()]
elif id == 101: elif id == 101:
self.mi.publisher = content.decode(codec, 'ignore') self.mi.publisher = content.decode(codec, 'ignore').strip()
elif id == 103: elif id == 103:
self.mi.comments = content.decode(codec, 'ignore') self.mi.comments = content.decode(codec, 'ignore')
elif id == 104: elif id == 104:
self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '') self.mi.isbn = content.decode(codec, 'ignore').strip().replace('-', '')
elif id == 105: elif id == 105:
self.mi.category = content.decode(codec, 'ignore') if not self.mi.tags:
self.mi.tags = []
self.mi.tags.append(content.decode(codec, 'ignore'))