AZW3: When converting from AZW3 files, use the high quality version of the image if the source AZW3 file contains both low and high quality images

2025-07-09 03:04:10 -04:00 · 2014-07-17 10:41:37 +05:30 · 2014-07-17 10:41:37 +05:30 · 2befb1e2e9
commit 2befb1e2e9
parent 734384b384
3 changed files with 118 additions and 46 deletions
--- a/src/calibre/ebooks/mobi/reader/containers.py
+++ b/src/calibre/ebooks/mobi/reader/containers.py
@ -0,0 +1,55 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 from struct import unpack_from, error
 from calibre.utils.magick.draw import identify_data
 from calibre.utils.imghdr import what
 def find_imgtype(data):
    """Return the image type detected for ``data``, or 'unknown'.
    The header sniffer ``what`` is consulted first; only when it returns
    None is ``identify_data`` tried, and any exception raised by that
    fallback yields the string 'unknown'."""
    sniffed = what(None, data)
    if sniffed is not None:
        return sniffed
    try:
        # identify_data returns a sequence; its third item is used as the type
        return identify_data(data)[2]
    except Exception:
        # Best effort: anything the fallback cannot identify is 'unknown'
        return 'unknown'
 class Container(object):
    """State for one container record and the resource records after it.
    ``is_image_container`` is True only when the record carries an EXTH
    block whose item 539 has the payload b'application/image'.
    ``resource_index`` counts the resources seen so far."""
    def __init__(self, data):
        """Scan the EXTH block of ``data`` (if any) for item 539."""
        self.is_image_container = False
        self.resource_index = 0
        # Nothing to scan unless the b'EXTH' marker sits at offset 48
        if len(data) <= 60 or data[48:52] != b'EXTH':
            return
        exth_length, _num_items = unpack_from(b'>LL', data, 52)
        scan_limit = 60 + exth_length - 8
        offset = 60
        while offset < scan_limit:
            try:
                item_type, item_size = unpack_from(b'>LL', data, offset)
            except error:
                # Ran off the end of the record; stop scanning
                break
            # The stored size includes the 8 byte (type, size) prefix
            payload_start = offset + 8
            payload_len = item_size - 8
            if payload_len < 0:
                break
            if item_type == 539:
                payload = data[payload_start:payload_start + payload_len]
                self.is_image_container = payload == b'application/image'
                break
            offset = payload_start + payload_len
    def load_image(self, data):
        """Count one resource and return ``(image_data, imgtype)`` for it.
        Returns ``(None, None)`` when this is not an image container or the
        image type comes back as 'unknown'. The first 12 bytes of ``data``
        are dropped before the type is detected."""
        self.resource_index += 1
        if not self.is_image_container:
            return None, None
        payload = data[12:]
        imgtype = find_imgtype(payload)
        if imgtype == 'unknown':
            return None, None
        return payload, imgtype
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -119,9 +119,10 @@ class MobiReader(object):
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
                    self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
                    self.book_header.mobi6_records = bh.records
-                    # Only the first_image_index from the MOBI 6 header is
+                    # Need the first_image_index from the mobi 6 header as well
                    # useful
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))
--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@ -18,12 +18,12 @@ from calibre.ebooks.mobi.reader.headers import NULL_INDEX
 from calibre.ebooks.mobi.reader.index import read_index
 from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
 from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
 from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
 from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.oeb.parse_utils import parse_html
 from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
 from calibre.utils.imghdr import what
 Part = namedtuple('Part',
    'num type filename start end aid')
@ -59,6 +59,12 @@ def reverse_tag_iter(block):
        yield block[plt:pgt+1]
        end = plt
 def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
    """Return the index of the first resource record.
    ``first_image_index`` is returned unchanged unless it is -1 or
    NULL_INDEX, in which case the result is ``num_of_text_records +
    first_text_record_number``."""
    if first_image_index in {-1, NULL_INDEX}:
        # No usable image index: fall back to the sum of the text record
        # count and the first text record number
        return num_of_text_records + first_text_record_number
    return first_image_index
 class Mobi8Reader(object):
    def __init__(self, mobi6_reader, log, for_tweak=False):
@ -69,11 +75,16 @@ class Mobi8Reader(object):
    def __call__(self):
        self.mobi6_reader.check_for_drm()
-        offset = 1
+        bh = self.mobi6_reader.book_header
        res_end = len(self.mobi6_reader.sections)
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
-            res_end = self.mobi6_reader.kf8_boundary
+            self.resource_offsets = [
                (get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
                (get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
            ]
        else:
            offset = 1
            self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]
        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
@ -81,18 +92,14 @@ class Mobi8Reader(object):
            f.write(self.raw_ml)
        self.kf8_sections = self.mobi6_reader.sections[offset-1:]
-        first_resource_index = self.header.first_image_index
+
        if first_resource_index in {-1, NULL_INDEX}:
            first_resource_index = self.header.records + 1
        self.resource_sections = \
                self.mobi6_reader.sections[first_resource_index:res_end]
        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
-        resource_map = self.extract_resources()
+        resource_map = self.extract_resources(self.mobi6_reader.sections)
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)
@ -385,19 +392,21 @@ class Mobi8Reader(object):
        # Build the TOC object
        return build_toc(index_entries)
-    def extract_resources(self):
+    def extract_resources(self, sections):
        from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
        resource_map = []
        container = None
        for x in ('fonts', 'images'):
            os.mkdir(x)
-        for i, sec in enumerate(self.resource_sections):
+        for start, end in self.resource_offsets:
            for i, sec in enumerate(sections[start:end]):
                fname_idx = i+1
                data = sec[0]
                typ = data[:4]
                href = None
                if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
-                       b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET'}:
+                        b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
                    pass  # Ignore these records
                elif typ == b'FONT':
                    font = read_font_record(data)
@ -412,15 +421,22 @@ class Mobi8Reader(object):
                                font['raw_data'])
                    if font['encrypted']:
                        self.encrypted_fonts.append(href)
-            else:
+                elif typ == b'CONT':
                    if data == b'CONTBOUNDARY':
                        container = None
                        continue
                    container = Container(data)
                elif typ == b'CRES':
                    data, imgtype = container.load_image(data)
                    if data is not None:
                        href = 'images/%05d.%s'%(container.resource_index, imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)
                elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4:
                    container.resource_index += 1
                elif container is None:
                    if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
-                    imgtype = what(None, data)
+                        imgtype = find_imgtype(data)
                    if imgtype is None:
                        from calibre.utils.magick.draw import identify_data
                        try:
                            imgtype = identify_data(data)[2]
                        except Exception:
                            imgtype = 'unknown'
                        href = 'images/%05d.%s'%(fname_idx, imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)