AZW3: When converting from AZW3 files, use the high quality version of the image if the source AZW3 file contains both low and high quality images

2025-07-09 03:04:10 -04:00 · 2014-07-17 10:41:37 +05:30 · 2014-07-17 10:41:37 +05:30 · 2befb1e2e9
commit 2befb1e2e9
parent 734384b384
3 changed files with 118 additions and 46 deletions
--- a/src/calibre/ebooks/mobi/reader/containers.py
+++ b/src/calibre/ebooks/mobi/reader/containers.py
@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from struct import unpack_from, error
+
+from calibre.utils.magick.draw import identify_data
+from calibre.utils.imghdr import what
+
+def find_imgtype(data):
+    '''
+    Return the image type detected for ``data``.
+
+    ``what()`` is consulted first. If it reports nothing, ``identify_data()``
+    is called instead and the third field of its result is used as the type.
+    If that call raises, the string ``'unknown'`` is returned.
+    '''
+    detected = what(None, data)
+    if detected is not None:
+        return detected
+    try:
+        return identify_data(data)[2]
+    except Exception:
+        # Best effort: any failure only means the type could not be determined
+        return 'unknown'
+
+class Container(object):
+
+    '''
+    Parsed form of a container header record.
+
+    ``is_image_container`` is True only when the record carries an EXTH block
+    whose item number 539 has the value ``application/image``.
+    ``resource_index`` counts the records passed to :meth:`load_image`.
+    '''
+
+    def __init__(self, data):
+        self.is_image_container = False
+        self.resource_index = 0
+
+        # Without an EXTH block at offset 48 there is nothing to inspect
+        if len(data) <= 60 or data[48:52] != b'EXTH':
+            return
+
+        # Two big-endian 32 bit values follow the magic: block length and
+        # item count (the count is not needed for the scan below)
+        exth_length, _num_items = unpack_from(b'>LL', data, 52)
+        limit = 60 + exth_length - 8
+        offset = 60
+        while offset < limit:
+            try:
+                item_type, item_size = unpack_from(b'>LL', data, offset)
+            except error:
+                break  # Not enough bytes left for another item header
+            # The stored size includes the 8 byte item header
+            value_start = offset + 8
+            value_size = item_size - 8
+            if value_size < 0:
+                break  # Impossible size, stop scanning
+            if item_type == 539:
+                value = data[value_start:value_start + value_size]
+                self.is_image_container = value == b'application/image'
+                break
+            offset = value_start + value_size
+
+    def load_image(self, data):
+        '''
+        Return ``(image_data, image_type)`` for a resource record, or
+        ``(None, None)`` if this is not an image container or the image type
+        cannot be determined. The first 12 bytes of ``data`` are dropped
+        before detection. Every call increments ``resource_index``.
+        '''
+        self.resource_index += 1
+        if not self.is_image_container:
+            return None, None
+        payload = data[12:]
+        imgtype = find_imgtype(payload)
+        if imgtype == 'unknown':
+            return None, None
+        return payload, imgtype
+
+
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -119,9 +119,10 @@ class MobiReader(object):
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
+                    self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
+                    self.book_header.mobi6_records = bh.records

-                    # Only the first_image_index from the MOBI 6 header is
-                    # useful
+                    # Need the first_image_index from the mobi 6 header as well
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))

--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -18,12 +18,12 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index
 from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
 from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
+from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
 from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.oeb.parse_utils import parse_html
 from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
-from calibre.utils.imghdr import what

 Part = namedtuple('Part',
    'num type filename start end aid')
@ -59,6 +59,12 @@ def reverse_tag_iter(block):
        yield block[plt:pgt+1]
        end = plt

+def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
+    '''
+    Return the index of the first resource record.
+
+    ``first_image_index`` is used as is, unless it is -1 or NULL_INDEX, in
+    which case the result is ``num_of_text_records +
+    first_text_record_number``.
+    '''
+    if first_image_index in {-1, NULL_INDEX}:
+        return num_of_text_records + first_text_record_number
+    return first_image_index
+
 class Mobi8Reader(object):

    def __init__(self, mobi6_reader, log, for_tweak=False):
@ -69,11 +75,16 @@ class Mobi8Reader(object):

    def __call__(self):
        self.mobi6_reader.check_for_drm()
-        offset = 1
-        res_end = len(self.mobi6_reader.sections)
+        bh = self.mobi6_reader.book_header
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
-            res_end = self.mobi6_reader.kf8_boundary
+            self.resource_offsets = [
+                (get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
+                (get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
+            ]
+        else:
+            offset = 1
+            self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
@ -81,18 +92,14 @@ class Mobi8Reader(object):
            f.write(self.raw_ml)

        self.kf8_sections = self.mobi6_reader.sections[offset-1:]
-        first_resource_index = self.header.first_image_index
-        if first_resource_index in {-1, NULL_INDEX}:
-            first_resource_index = self.header.records + 1
-        self.resource_sections = \
-                self.mobi6_reader.sections[first_resource_index:res_end]
+
        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)

        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
-        resource_map = self.extract_resources()
+        resource_map = self.extract_resources(self.mobi6_reader.sections)
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)

@ -385,47 +392,56 @@ class Mobi8Reader(object):
        # Build the TOC object
        return build_toc(index_entries)

-    def extract_resources(self):
+    def extract_resources(self, sections):
+        '''
+        Write the fonts and images found in ``sections`` into the ``fonts``
+        and ``images`` directories, which are created in the current
+        directory.
+
+        Only the slices of ``sections`` listed in ``self.resource_offsets``
+        are examined. Returns a list with one entry per examined record: the
+        relative href of the file written for it, or None if nothing was
+        written. Records equal to ``CONTBOUNDARY`` get no entry.
+        '''
        from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
        resource_map = []
+        container = None
        for x in ('fonts', 'images'):
            os.mkdir(x)

-        for i, sec in enumerate(self.resource_sections):
-            fname_idx = i+1
-            data = sec[0]
-            typ = data[:4]
-            href = None
-            if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
-                       b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET'}:
-                pass  # Ignore these records
-            elif typ == b'FONT':
-                font = read_font_record(data)
-                href = "fonts/%05d.%s" % (fname_idx, font['ext'])
-                if font['err']:
-                    self.log.warn('Reading font record %d failed: %s'%(
-                        fname_idx, font['err']))
-                    if font['headers']:
-                        self.log.debug('Font record headers: %s'%font['headers'])
-                with open(href.replace('/', os.sep), 'wb') as f:
-                    f.write(font['font_data'] if font['font_data'] else
-                            font['raw_data'])
-                if font['encrypted']:
-                    self.encrypted_fonts.append(href)
-            else:
-                if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
-                    imgtype = what(None, data)
-                    if imgtype is None:
-                        from calibre.utils.magick.draw import identify_data
-                        try:
-                            imgtype = identify_data(data)[2]
-                        except Exception:
-                            imgtype = 'unknown'
-                    href = 'images/%05d.%s'%(fname_idx, imgtype)
+        for start, end in self.resource_offsets:
+            for i, sec in enumerate(sections[start:end]):
+                # File names are numbered from 1 within each offset range
+                fname_idx = i+1
+                data = sec[0]
+                typ = data[:4]
+                href = None
+                if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
+                        b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
+                    pass  # Ignore these records
+                elif typ == b'FONT':
+                    font = read_font_record(data)
+                    href = "fonts/%05d.%s" % (fname_idx, font['ext'])
+                    if font['err']:
+                        self.log.warn('Reading font record %d failed: %s'%(
+                            fname_idx, font['err']))
+                        if font['headers']:
+                            self.log.debug('Font record headers: %s'%font['headers'])
                    with open(href.replace('/', os.sep), 'wb') as f:
-                        f.write(data)
+                        f.write(font['font_data'] if font['font_data'] else
+                                font['raw_data'])
+                    if font['encrypted']:
+                        self.encrypted_fonts.append(href)
+                elif typ == b'CONT':
+                    if data == b'CONTBOUNDARY':
+                        # End of the current container
+                        container = None
+                        continue
+                    container = Container(data)
+                elif typ == b'CRES':
+                    # A CRES record is only meaningful inside a container; skip
+                    # strays instead of failing because container is None
+                    if container is not None:
+                        data, imgtype = container.load_image(data)
+                        if data is not None:
+                            href = 'images/%05d.%s'%(container.resource_index, imgtype)
+                            with open(href.replace('/', os.sep), 'wb') as f:
+                                f.write(data)
+                elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4:
+                    # Four byte filler record: it uses up a resource index in
+                    # the current container but produces no file
+                    if container is not None:
+                        container.resource_index += 1
+                elif container is None:
+                    if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
+                        imgtype = find_imgtype(data)
+                        href = 'images/%05d.%s'%(fname_idx, imgtype)
+                        with open(href.replace('/', os.sep), 'wb') as f:
+                            f.write(data)

-            resource_map.append(href)
+                resource_map.append(href)

        return resource_map