From 6c42c0ea6cef09a945cd887e0af898d7bafc1733 Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 17:55:15 +0200 Subject: [PATCH 1/4] Produce an output more similar to Mobi Unpack when dumping a MOBI header --- src/calibre/ebooks/mobi/debug/headers.py | 120 ++++++++++++++--------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug/headers.py b/src/calibre/ebooks/mobi/debug/headers.py index 5b80a46f1b..7e2fc6f3cb 100644 --- a/src/calibre/ebooks/mobi/debug/headers.py +++ b/src/calibre/ebooks/mobi/debug/headers.py @@ -116,61 +116,85 @@ class Record(object): # {{{ # EXTH {{{ class EXTHRecord(object): - def __init__(self, type_, data): + def __init__(self, type_, data, length): self.type = type_ self.data = data + self.length = length self.name = { - 1 : 'DRM Server id', - 2 : 'DRM Commerce id', - 3 : 'DRM ebookbase book id', - 100 : 'author', - 101 : 'publisher', - 102 : 'imprint', - 103 : 'description', - 104 : 'isbn', - 105 : 'subject', - 106 : 'publishingdate', - 107 : 'review', - 108 : 'contributor', - 109 : 'rights', - 110 : 'subjectcode', - 111 : 'type', - 112 : 'source', - 113 : 'asin', - 114 : 'versionnumber', + 1 : 'Drm Server Id', + 2 : 'Drm Commerce Id', + 3 : 'Drm Ebookbase Book Id', + 100 : 'Creator', + 101 : 'Publisher', + 102 : 'Imprint', + 103 : 'Description', + 104 : 'ISBN', + 105 : 'Subject', + 106 : 'Published', + 107 : 'Review', + 108 : 'Contributor', + 109 : 'Rights', + 110 : 'SubjectCode', + 111 : 'Type', + 112 : 'Source', + 113 : 'ASIN', + 114 : 'versionNumber', 115 : 'sample', - 116 : 'startreading', - 117 : 'adult', - 118 : 'retailprice', - 119 : 'retailpricecurrency', - 121 : 'KF8 header section index', - 125 : 'KF8 resources (images/fonts) count', - 129 : 'KF8 cover URI', - 131 : 'KF8 unknown count', - 201 : 'coveroffset', - 202 : 'thumboffset', - 203 : 'hasfakecover', + 116 : 'StartOffset', + 117 : 'Adult', + 118 : 'Price', + 119 : 'Currency', + 121 : 'KF8_Boundary_Section', + 122 : 
'fixed-layout', + 123 : 'book-type', + 124 : 'orientation-lock', + 125 : 'KF8_Count_of_Resources_Fonts_Images', + 126 : 'original-resolution', + 127 : 'zero-gutter', + 128 : 'zero-margin', + 129 : 'KF8_Masthead/Cover_Image', + 131 : 'KF8_Unidentified_Count', + 132 : 'RegionMagnification', + 200 : 'DictShortName', + 201 : 'CoverOffset', + 202 : 'ThumbOffset', + 203 : 'Fake Cover', 204 : 'Creator Software', 205 : 'Creator Major Version', # '>I' 206 : 'Creator Minor Version', # '>I' 207 : 'Creator Build Number', # '>I' - 208 : 'watermark', - 209 : 'tamper_proof_keys', - 300 : 'fontsignature', - 301 : 'clippinglimit', # percentage '>B' - 402 : 'publisherlimit', - 404 : 'TTS flag', # '>B' 1 - TTS disabled 0 - TTS enabled - 501 : 'cdetype', # 4 chars (PDOC or EBOK) - 502 : 'lastupdatetime', - 503 : 'updatedtitle', - 524 : 'language', + 208 : 'Watermark', + 209 : 'Tamper Proof Keys [hex]', + 300 : 'Font Signature [hex]', + 301 : 'Clipping Limit [3xx]', # percentage '>B' + 401 : 'Clipping Limit', # percentage '>B' + 402 : 'Publisher Limit', + 404 : 'Text to Speech Disabled', # '>B' 1 - TTS disabled 0 - TTS enabled + 501 : 'CDE Type', # 4 chars (PDOC, EBOK, MAGZ, ...) 
+ 502 : 'last_update_time', + 503 : 'Updated Title', + 504 : 'ASIN [5xx]', + 524 : 'Language', + 525 : 'TextDirection', + 528 : 'Unknown_Logical_Value', + 535 : 'Kindlegen Build-Rev Number', }.get(self.type, repr(self.type)) - if (self.name in {'coveroffset', 'thumboffset', 'hasfakecover', - 'Creator Major Version', 'Creator Minor Version', - 'Creator Build Number', 'Creator Software', 'startreading'} or + if (self.name in {'sample', 'StartOffset', 'CoverOffset', 'ThumbOffset', 'Fake Cover', + 'Creator Software', 'Creator Major Version', 'Creator Minor Version', + 'Creator Build Number', 'Clipping Limit [3xx]', 'Clipping Limit', + 'Publisher Limit', 'Text to Speech Disabled'} or self.type in {121, 125, 131}): - self.data, = struct.unpack(b'>I', self.data) + if self.length == 9: + self.data, = struct.unpack(b'>B', self.data) + elif self.length == 10: + self.data, = struct.unpack(b'>H', self.data) + elif self.length == 12: + self.data, = struct.unpack(b'>L', self.data) + else: + self.data, = struct.unpack(b'>I', self.data) # Assume I for unknown sizes... 
+ elif self.type in {209, 300}: + self.data = bytes(self.data.encode('hex')) def __str__(self): return '%s (%d): %r'%(self.name, self.type, self.data) @@ -181,8 +205,8 @@ class EXTHHeader(object): self.raw = raw if not self.raw.startswith(b'EXTH'): raise ValueError('EXTH header does not start with EXTH') - self.length, = struct.unpack(b'>I', self.raw[4:8]) - self.count, = struct.unpack(b'>I', self.raw[8:12]) + self.length, = struct.unpack(b'>L', self.raw[4:8]) + self.count, = struct.unpack(b'>L', self.raw[8:12]) pos = 12 self.records = [] @@ -199,9 +223,9 @@ class EXTHHeader(object): return getattr(ans, 'data', default) def read_record(self, pos): - type_, length = struct.unpack(b'>II', self.raw[pos:pos+8]) + type_, length = struct.unpack(b'>LL', self.raw[pos:pos+8]) data = self.raw[(pos+8):(pos+length)] - self.records.append(EXTHRecord(type_, data)) + self.records.append(EXTHRecord(type_, data, length)) return pos + length @property From 8297eebb61a5b3987d4d4452dc22377b3ab89690 Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 17:56:15 +0200 Subject: [PATCH 2/4] Don't horribly die when dumping a mobi whose HTML content is not encoded in UTF-8 --- src/calibre/ebooks/mobi/debug/mobi6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/debug/mobi6.py b/src/calibre/ebooks/mobi/debug/mobi6.py index fb5674653c..938629e391 100644 --- a/src/calibre/ebooks/mobi/debug/mobi6.py +++ b/src/calibre/ebooks/mobi/debug/mobi6.py @@ -802,7 +802,7 @@ def inspect_mobi(mobi_file, ddir): alltext += rec.raw of.seek(0) - root = html.fromstring(alltext.decode('utf-8')) + root = html.fromstring(alltext.decode(f.mobi_header.encoding)) with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: of.write(html.tostring(root, pretty_print=True, encoding='utf-8', include_meta_content_type=True)) From ca467cdb2ee2d7ed9c858e54dab6e64f321d514f Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 17:58:09 +0200 Subject: [PATCH 3/4] Split each 
author into its own EXTH Creator field, even with the 'Use author_sort as author' setting. Also, on a slightly unrelated note, sprinkle azw3 awareness in a few places. --- src/calibre/ebooks/__init__.py | 2 +- src/calibre/ebooks/metadata/archive.py | 2 +- src/calibre/ebooks/metadata/meta.py | 4 ++-- src/calibre/ebooks/metadata/mobi.py | 12 ++++++++---- src/calibre/ebooks/metadata/rar.py | 2 +- src/calibre/ebooks/metadata/zip.py | 2 +- src/calibre/ebooks/mobi/writer8/exth.py | 10 +++++++++- src/calibre/gui2/actions/add.py | 2 +- src/calibre/gui2/wizard/__init__.py | 2 +- 9 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index da820cffbb..ee880000f0 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -163,7 +163,7 @@ def render_html(path_to_html, width=590, height=750, as_xhtml=True): def check_ebook_format(stream, current_guess): ans = current_guess - if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1'): + if current_guess.lower() in ('prc', 'mobi', 'azw', 'azw1', 'azw3'): stream.seek(0) if stream.read(3) == 'TPZ': ans = 'tpz' diff --git a/src/calibre/ebooks/metadata/archive.py b/src/calibre/ebooks/metadata/archive.py index b9136e5a13..e28389d7f7 100644 --- a/src/calibre/ebooks/metadata/archive.py +++ b/src/calibre/ebooks/metadata/archive.py @@ -70,7 +70,7 @@ class ArchiveExtract(FileTypePlugin): fname = fnames[0] ext = os.path.splitext(fname)[1][1:] if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', - 'mp3', 'pdb', 'azw', 'azw1', 'fb2'): + 'mp3', 'pdb', 'azw', 'azw1', 'azw3', 'fb2'): return archive of = self.temporary_file('_archive_extract.'+ext) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 61afe3c49c..83d109fcef 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -15,7 +15,7 @@ _METADATA_PRIORITIES = [ 'html', 'htm', 'xhtml', 'xhtm', 'rtf', 
'fb2', 'pdf', 'prc', 'odt', 'epub', 'lit', 'lrx', 'lrf', 'mobi', - 'rb', 'imp', 'azw', 'snb' + 'rb', 'imp', 'azw', 'azw3', 'azw1', 'snb' ] # The priorities for loading metadata from different file types @@ -85,7 +85,7 @@ def _get_metadata(stream, stream_type, use_libprs_metadata, if stream_type: stream_type = stream_type.lower() if stream_type in ('html', 'html', 'xhtml', 'xhtm', 'xml'): stream_type = 'html' - if stream_type in ('mobi', 'prc', 'azw'): + if stream_type in ('mobi', 'prc', 'azw', 'azw1', 'azw3'): stream_type = 'mobi' if stream_type in ('odt', 'ods', 'odp', 'odg', 'odf'): stream_type = 'odt' diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index 846015f491..e701946c01 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -341,11 +341,14 @@ class MetadataUpdater(object): kindle_pdoc = None share_not_sync = False if mi.author_sort and pas: - authors = mi.author_sort - update_exth_record((100, normalize(authors).encode(self.codec, 'replace'))) + # We want an EXTH field per author... + authors = mi.author_sort.split(' & ') + for author in authors: + update_exth_record((100, normalize(author).encode(self.codec, 'replace'))) elif mi.authors: - authors = ';'.join(mi.authors) - update_exth_record((100, normalize(authors).encode(self.codec, 'replace'))) + authors = mi.authors + for author in authors: + update_exth_record((100, normalize(author).encode(self.codec, 'replace'))) if mi.publisher: update_exth_record((101, normalize(mi.publisher).encode(self.codec, 'replace'))) if mi.comments: @@ -360,6 +363,7 @@ class MetadataUpdater(object): if mi.isbn: update_exth_record((104, mi.isbn.encode(self.codec, 'replace'))) if mi.tags: + # FIXME: Keep a single subject per EXTH field? 
subjects = '; '.join(mi.tags) update_exth_record((105, normalize(subjects).encode(self.codec, 'replace'))) diff --git a/src/calibre/ebooks/metadata/rar.py b/src/calibre/ebooks/metadata/rar.py index a9b5d45546..58ca283a1a 100644 --- a/src/calibre/ebooks/metadata/rar.py +++ b/src/calibre/ebooks/metadata/rar.py @@ -32,7 +32,7 @@ def get_metadata(stream): if stream_type: stream_type = stream_type[1:] if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', - 'rb', 'imp', 'pdf', 'lrf', 'azw'): + 'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'): with TemporaryDirectory() as tdir: with CurrentDir(tdir): stream = extract_member(path, match=None, name=f, diff --git a/src/calibre/ebooks/metadata/zip.py b/src/calibre/ebooks/metadata/zip.py index 887975b993..7369d2055c 100644 --- a/src/calibre/ebooks/metadata/zip.py +++ b/src/calibre/ebooks/metadata/zip.py @@ -23,7 +23,7 @@ def get_metadata(stream): if stream_type: stream_type = stream_type[1:] if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', - 'rb', 'imp', 'pdf', 'lrf', 'azw'): + 'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'): with TemporaryDirectory() as tdir: with CurrentDir(tdir): path = zf.extract(f) diff --git a/src/calibre/ebooks/mobi/writer8/exth.py b/src/calibre/ebooks/mobi/writer8/exth.py index 508b77ce5b..fa0da0d2ee 100644 --- a/src/calibre/ebooks/mobi/writer8/exth.py +++ b/src/calibre/ebooks/mobi/writer8/exth.py @@ -54,8 +54,16 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False, items = metadata[term] if term == 'creator': if prefer_author_sort: - creators = [unicode(c.file_as or c) for c in + # This is a bit hackish... We only get the first item in the creators list, + # because we only care about the file_as property, and it contains *all* the authors in every creator markup, + # so we only need one, or we end up with duplicates ;). + # We then end up with a single item in our list that contains every author, in author sort syntax, separated by an ' & ' character. 
+ # That's not good enough, because we want each author in a separate entry in the list, so we just split this on every & ;). + # This way, we properly end up with multiple Creator fields in the EXTH header, one for each author, like KindleGen :). + all_creators = [unicode(c.file_as or c) for c in items][:1] + for creator in all_creators: + creators = creator.split(' & ') else: creators = [unicode(c) for c in items] items = creators diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py index ef7ed7a594..4f3e9fc066 100644 --- a/src/calibre/gui2/actions/add.py +++ b/src/calibre/gui2/actions/add.py @@ -30,7 +30,7 @@ def get_filters(): (_('LRF Books'), ['lrf']), (_('HTML Books'), ['htm', 'html', 'xhtm', 'xhtml']), (_('LIT Books'), ['lit']), - (_('MOBI Books'), ['mobi', 'prc', 'azw']), + (_('MOBI Books'), ['mobi', 'prc', 'azw', 'azw3']), (_('Topaz books'), ['tpz','azw1']), (_('Text books'), ['txt', 'text', 'rtf']), (_('PDF Books'), ['pdf', 'azw4']), diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index d831307d9a..569c72ae55 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -446,7 +446,7 @@ class KindlePage(QWizardPage, KindleUI): if not accounts: accounts = {} for y in accounts.values(): y[2] = False - accounts[x] = ['AZW, MOBI, TPZ, PRC, AZW1', True, True] + accounts[x] = ['AZW, MOBI, TPZ, PRC, AZW1, AZW3', True, True] conf.set('accounts', accounts) def nextId(self): From e669ffd8eccedb80f444e124ad97a3b08aa46c5a Mon Sep 17 00:00:00 2001 From: NiLuJe Date: Fri, 14 Sep 2012 18:49:13 +0200 Subject: [PATCH 4/4] Nope, Amazon still doesn't allow sending AZW3 over email, my bad. 
--- src/calibre/gui2/wizard/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index 569c72ae55..d831307d9a 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -446,7 +446,7 @@ class KindlePage(QWizardPage, KindleUI): if not accounts: accounts = {} for y in accounts.values(): y[2] = False - accounts[x] = ['AZW, MOBI, TPZ, PRC, AZW1, AZW3', True, True] + accounts[x] = ['AZW, MOBI, TPZ, PRC, AZW1', True, True] conf.set('accounts', accounts) def nextId(self):