mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Recognize azw3 books in archives, and put multiple authors in multiple EXTH records in MOBI files, following kindlegen
This commit is contained in:
commit
78bcb88231
@ -163,7 +163,7 @@ def render_html(path_to_html, width=590, height=750, as_xhtml=True):
|
|||||||
|
|
||||||
def check_ebook_format(stream, current_guess):
|
def check_ebook_format(stream, current_guess):
|
||||||
ans = current_guess
|
ans = current_guess
|
||||||
if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1'):
|
if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1', 'azw3'):
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
if stream.read(3) == 'TPZ':
|
if stream.read(3) == 'TPZ':
|
||||||
ans = 'tpz'
|
ans = 'tpz'
|
||||||
|
@ -70,7 +70,7 @@ class ArchiveExtract(FileTypePlugin):
|
|||||||
fname = fnames[0]
|
fname = fnames[0]
|
||||||
ext = os.path.splitext(fname)[1][1:]
|
ext = os.path.splitext(fname)[1][1:]
|
||||||
if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf',
|
if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf',
|
||||||
'mp3', 'pdb', 'azw', 'azw1', 'fb2'):
|
'mp3', 'pdb', 'azw', 'azw1', 'azw3', 'fb2'):
|
||||||
return archive
|
return archive
|
||||||
|
|
||||||
of = self.temporary_file('_archive_extract.'+ext)
|
of = self.temporary_file('_archive_extract.'+ext)
|
||||||
|
@ -15,7 +15,7 @@ _METADATA_PRIORITIES = [
|
|||||||
'html', 'htm', 'xhtml', 'xhtm',
|
'html', 'htm', 'xhtml', 'xhtm',
|
||||||
'rtf', 'fb2', 'pdf', 'prc', 'odt',
|
'rtf', 'fb2', 'pdf', 'prc', 'odt',
|
||||||
'epub', 'lit', 'lrx', 'lrf', 'mobi',
|
'epub', 'lit', 'lrx', 'lrf', 'mobi',
|
||||||
'rb', 'imp', 'azw', 'snb'
|
'azw', 'azw3', 'azw1', 'rb', 'imp', 'snb'
|
||||||
]
|
]
|
||||||
|
|
||||||
# The priorities for loading metadata from different file types
|
# The priorities for loading metadata from different file types
|
||||||
|
@ -341,11 +341,14 @@ class MetadataUpdater(object):
|
|||||||
kindle_pdoc = None
|
kindle_pdoc = None
|
||||||
share_not_sync = False
|
share_not_sync = False
|
||||||
if mi.author_sort and pas:
|
if mi.author_sort and pas:
|
||||||
authors = mi.author_sort
|
# We want an EXTH field per author...
|
||||||
update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
|
authors = mi.author_sort.split(' & ')
|
||||||
|
for author in authors:
|
||||||
|
update_exth_record((100, normalize(author).encode(self.codec, 'replace')))
|
||||||
elif mi.authors:
|
elif mi.authors:
|
||||||
authors = ';'.join(mi.authors)
|
authors = mi.authors
|
||||||
update_exth_record((100, normalize(authors).encode(self.codec, 'replace')))
|
for author in authors:
|
||||||
|
update_exth_record((100, normalize(author).encode(self.codec, 'replace')))
|
||||||
if mi.publisher:
|
if mi.publisher:
|
||||||
update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace')))
|
update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace')))
|
||||||
if mi.comments:
|
if mi.comments:
|
||||||
@ -360,6 +363,7 @@ class MetadataUpdater(object):
|
|||||||
if mi.isbn:
|
if mi.isbn:
|
||||||
update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
|
update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
|
||||||
if mi.tags:
|
if mi.tags:
|
||||||
|
# FIXME: Keep a single subject per EXTH field?
|
||||||
subjects = '; '.join(mi.tags)
|
subjects = '; '.join(mi.tags)
|
||||||
update_exth_record((105, normalize(subjects).encode(self.codec, 'replace')))
|
update_exth_record((105, normalize(subjects).encode(self.codec, 'replace')))
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ def get_metadata(stream):
|
|||||||
if stream_type:
|
if stream_type:
|
||||||
stream_type = stream_type[1:]
|
stream_type = stream_type[1:]
|
||||||
if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
|
if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
|
||||||
'rb', 'imp', 'pdf', 'lrf', 'azw'):
|
'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'):
|
||||||
with TemporaryDirectory() as tdir:
|
with TemporaryDirectory() as tdir:
|
||||||
with CurrentDir(tdir):
|
with CurrentDir(tdir):
|
||||||
stream = extract_member(path, match=None, name=f,
|
stream = extract_member(path, match=None, name=f,
|
||||||
|
@ -23,7 +23,7 @@ def get_metadata(stream):
|
|||||||
if stream_type:
|
if stream_type:
|
||||||
stream_type = stream_type[1:]
|
stream_type = stream_type[1:]
|
||||||
if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
|
if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
|
||||||
'rb', 'imp', 'pdf', 'lrf', 'azw'):
|
'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'):
|
||||||
with TemporaryDirectory() as tdir:
|
with TemporaryDirectory() as tdir:
|
||||||
with CurrentDir(tdir):
|
with CurrentDir(tdir):
|
||||||
path = zf.extract(f)
|
path = zf.extract(f)
|
||||||
|
@ -116,61 +116,83 @@ class Record(object): # {{{
|
|||||||
# EXTH {{{
|
# EXTH {{{
|
||||||
class EXTHRecord(object):
|
class EXTHRecord(object):
|
||||||
|
|
||||||
def __init__(self, type_, data):
|
def __init__(self, type_, data, length):
|
||||||
self.type = type_
|
self.type = type_
|
||||||
self.data = data
|
self.data = data
|
||||||
|
self.length = length
|
||||||
self.name = {
|
self.name = {
|
||||||
1 : 'DRM Server id',
|
1 : 'Drm Server Id',
|
||||||
2 : 'DRM Commerce id',
|
2 : 'Drm Commerce Id',
|
||||||
3 : 'DRM ebookbase book id',
|
3 : 'Drm Ebookbase Book Id',
|
||||||
100 : 'author',
|
100 : 'Creator',
|
||||||
101 : 'publisher',
|
101 : 'Publisher',
|
||||||
102 : 'imprint',
|
102 : 'Imprint',
|
||||||
103 : 'description',
|
103 : 'Description',
|
||||||
104 : 'isbn',
|
104 : 'ISBN',
|
||||||
105 : 'subject',
|
105 : 'Subject',
|
||||||
106 : 'publishingdate',
|
106 : 'Published',
|
||||||
107 : 'review',
|
107 : 'Review',
|
||||||
108 : 'contributor',
|
108 : 'Contributor',
|
||||||
109 : 'rights',
|
109 : 'Rights',
|
||||||
110 : 'subjectcode',
|
110 : 'SubjectCode',
|
||||||
111 : 'type',
|
111 : 'Type',
|
||||||
112 : 'source',
|
112 : 'Source',
|
||||||
113 : 'asin',
|
113 : 'ASIN',
|
||||||
114 : 'versionnumber',
|
114 : 'versionNumber',
|
||||||
115 : 'sample',
|
115 : 'sample',
|
||||||
116 : 'startreading',
|
116 : 'StartOffset',
|
||||||
117 : 'adult',
|
117 : 'Adult',
|
||||||
118 : 'retailprice',
|
118 : 'Price',
|
||||||
119 : 'retailpricecurrency',
|
119 : 'Currency',
|
||||||
121 : 'KF8 header section index',
|
121 : 'KF8_Boundary_Section',
|
||||||
125 : 'KF8 resources (images/fonts) count',
|
122 : 'fixed-layout',
|
||||||
129 : 'KF8 cover URI',
|
123 : 'book-type',
|
||||||
131 : 'KF8 unknown count',
|
124 : 'orientation-lock',
|
||||||
201 : 'coveroffset',
|
125 : 'KF8_Count_of_Resources_Fonts_Images',
|
||||||
202 : 'thumboffset',
|
126 : 'original-resolution',
|
||||||
203 : 'hasfakecover',
|
127 : 'zero-gutter',
|
||||||
|
128 : 'zero-margin',
|
||||||
|
129 : 'KF8_Masthead/Cover_Image',
|
||||||
|
131 : 'KF8_Unidentified_Count',
|
||||||
|
132 : 'RegionMagnification',
|
||||||
|
200 : 'DictShortName',
|
||||||
|
201 : 'CoverOffset',
|
||||||
|
202 : 'ThumbOffset',
|
||||||
|
203 : 'Fake Cover',
|
||||||
204 : 'Creator Software',
|
204 : 'Creator Software',
|
||||||
205 : 'Creator Major Version', # '>I'
|
205 : 'Creator Major Version', # '>I'
|
||||||
206 : 'Creator Minor Version', # '>I'
|
206 : 'Creator Minor Version', # '>I'
|
||||||
207 : 'Creator Build Number', # '>I'
|
207 : 'Creator Build Number', # '>I'
|
||||||
208 : 'watermark',
|
208 : 'Watermark',
|
||||||
209 : 'tamper_proof_keys',
|
209 : 'Tamper Proof Keys [hex]',
|
||||||
300 : 'fontsignature',
|
300 : 'Font Signature [hex]',
|
||||||
301 : 'clippinglimit', # percentage '>B'
|
301 : 'Clipping Limit [3xx]', # percentage '>B'
|
||||||
402 : 'publisherlimit',
|
401 : 'Clipping Limit', # percentage '>B'
|
||||||
404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled
|
402 : 'Publisher Limit',
|
||||||
501 : 'cdetype', # 4 chars (PDOC or EBOK)
|
404 : 'Text to Speech Disabled', # '>B' 1 - TTS disabled 0 - TTS enabled
|
||||||
502 : 'lastupdatetime',
|
501 : 'CDE Type', # 4 chars (PDOC, EBOK, MAGZ, ...)
|
||||||
503 : 'updatedtitle',
|
502 : 'last_update_time',
|
||||||
524 : 'language',
|
503 : 'Updated Title',
|
||||||
|
504 : 'ASIN [5xx]',
|
||||||
|
524 : 'Language',
|
||||||
|
525 : 'TextDirection',
|
||||||
|
528 : 'Unknown_Logical_Value',
|
||||||
|
535 : 'Kindlegen Build-Rev Number',
|
||||||
}.get(self.type, repr(self.type))
|
}.get(self.type, repr(self.type))
|
||||||
|
|
||||||
if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover',
|
if (self.name in {'sample', 'StartOffset', 'CoverOffset', 'ThumbOffset', 'Fake Cover',
|
||||||
'Creator Major Version', 'Creator Minor Version',
|
'Creator Software', 'Creator Major Version', 'Creator Minor Version',
|
||||||
'Creator Build Number', 'Creator Software', 'startreading'} or
|
'Creator Build Number', 'Clipping Limit (3xx)', 'Clipping Limit',
|
||||||
|
'Publisher Limit', 'Text to Speech Disabled'} or
|
||||||
self.type in {121, 125, 131}):
|
self.type in {121, 125, 131}):
|
||||||
self.data, = struct.unpack(b'>I', self.data)
|
if self.length == 9:
|
||||||
|
self.data, = struct.unpack(b'>B', self.data)
|
||||||
|
elif self.length == 10:
|
||||||
|
self.data, = struct.unpack(b'>H', self.data)
|
||||||
|
else:
|
||||||
|
self.data, = struct.unpack(b'>L', self.data)
|
||||||
|
elif self.type in {209, 300}:
|
||||||
|
self.data = bytes(self.data.encode('hex'))
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '%s (%d): %r'%(self.name, self.type, self.data)
|
return '%s (%d): %r'%(self.name, self.type, self.data)
|
||||||
@ -181,8 +203,8 @@ class EXTHHeader(object):
|
|||||||
self.raw = raw
|
self.raw = raw
|
||||||
if not self.raw.startswith(b'EXTH'):
|
if not self.raw.startswith(b'EXTH'):
|
||||||
raise ValueError('EXTH header does not start with EXTH')
|
raise ValueError('EXTH header does not start with EXTH')
|
||||||
self.length, = struct.unpack(b'>I', self.raw[4:8])
|
self.length, = struct.unpack(b'>L', self.raw[4:8])
|
||||||
self.count, = struct.unpack(b'>I', self.raw[8:12])
|
self.count, = struct.unpack(b'>L', self.raw[8:12])
|
||||||
|
|
||||||
pos = 12
|
pos = 12
|
||||||
self.records = []
|
self.records = []
|
||||||
@ -199,9 +221,9 @@ class EXTHHeader(object):
|
|||||||
return getattr(ans, 'data', default)
|
return getattr(ans, 'data', default)
|
||||||
|
|
||||||
def read_record(self, pos):
|
def read_record(self, pos):
|
||||||
type_, length = struct.unpack(b'>II', self.raw[pos:pos+8])
|
type_, length = struct.unpack(b'>LL', self.raw[pos:pos+8])
|
||||||
data = self.raw[(pos+8):(pos+length)]
|
data = self.raw[(pos+8):(pos+length)]
|
||||||
self.records.append(EXTHRecord(type_, data))
|
self.records.append(EXTHRecord(type_, data, length))
|
||||||
return pos + length
|
return pos + length
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -802,7 +802,7 @@ def inspect_mobi(mobi_file, ddir):
|
|||||||
alltext += rec.raw
|
alltext += rec.raw
|
||||||
of.seek(0)
|
of.seek(0)
|
||||||
|
|
||||||
root = html.fromstring(alltext.decode('utf-8'))
|
root = html.fromstring(alltext.decode(f.mobi_header.encoding))
|
||||||
with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
|
with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
|
||||||
of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
|
of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
|
||||||
include_meta_content_type=True))
|
include_meta_content_type=True))
|
||||||
|
@ -88,7 +88,7 @@ class EXTHHeader(object): # {{{
|
|||||||
self.mi.authors = []
|
self.mi.authors = []
|
||||||
au = content.decode(codec, 'ignore').strip()
|
au = content.decode(codec, 'ignore').strip()
|
||||||
self.mi.authors.append(au)
|
self.mi.authors.append(au)
|
||||||
if re.match(r'\S+?\s*,\s+\S+', au.strip()):
|
if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
|
||||||
self.mi.author_sort = au.strip()
|
self.mi.author_sort = au.strip()
|
||||||
elif idx == 101:
|
elif idx == 101:
|
||||||
self.mi.publisher = content.decode(codec, 'ignore').strip()
|
self.mi.publisher = content.decode(codec, 'ignore').strip()
|
||||||
|
@ -13,6 +13,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
from calibre.ebooks.mobi.utils import utf8_text
|
from calibre.ebooks.mobi.utils import utf8_text
|
||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
|
from calibre.ebooks.metadata import authors_to_sort_string
|
||||||
|
|
||||||
EXTH_CODES = {
|
EXTH_CODES = {
|
||||||
'creator': 100,
|
'creator': 100,
|
||||||
@ -54,8 +55,8 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
|
|||||||
items = metadata[term]
|
items = metadata[term]
|
||||||
if term == 'creator':
|
if term == 'creator':
|
||||||
if prefer_author_sort:
|
if prefer_author_sort:
|
||||||
creators = [unicode(c.file_as or c) for c in
|
creators = [authors_to_sort_string([unicode(c)]) for c in
|
||||||
items][:1]
|
items]
|
||||||
else:
|
else:
|
||||||
creators = [unicode(c) for c in items]
|
creators = [unicode(c) for c in items]
|
||||||
items = creators
|
items = creators
|
||||||
|
@ -30,7 +30,7 @@ def get_filters():
|
|||||||
(_('LRF Books'), ['lrf']),
|
(_('LRF Books'), ['lrf']),
|
||||||
(_('HTML Books'), ['htm', 'html', 'xhtm', 'xhtml']),
|
(_('HTML Books'), ['htm', 'html', 'xhtm', 'xhtml']),
|
||||||
(_('LIT Books'), ['lit']),
|
(_('LIT Books'), ['lit']),
|
||||||
(_('MOBI Books'), ['mobi', 'prc', 'azw']),
|
(_('MOBI Books'), ['mobi', 'prc', 'azw', 'azw3']),
|
||||||
(_('Topaz books'), ['tpz','azw1']),
|
(_('Topaz books'), ['tpz','azw1']),
|
||||||
(_('Text books'), ['txt', 'text', 'rtf']),
|
(_('Text books'), ['txt', 'text', 'rtf']),
|
||||||
(_('PDF Books'), ['pdf', 'azw4']),
|
(_('PDF Books'), ['pdf', 'azw4']),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user