Book polishing: Add tool to losslessly compress images in the book in order to reduce its filesize, without affecting image quality

2025-07-09 03:04:10 -04:00 · 2015-11-27 16:02:56 +05:30 · 2015-11-27 16:02:56 +05:30 · a7489de7cb
commit a7489de7cb
parent b47f7b8b45
5 changed files with 196 additions and 1 deletions
--- a/src/calibre/ebooks/oeb/polish/images.py
+++ b/src/calibre/ebooks/oeb/polish/images.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+import os
+from functools import partial
+from threading import Thread
+from Queue import Queue, Empty
+
+from calibre import detect_ncpus, human_readable
+
+class Worker(Thread):
+    '''
+    Thread that takes image names off a shared queue and compresses each
+    image in place. The outcome for every image is stored in the shared
+    results dict as ``name -> (True, (size_before, size_after))`` or, when
+    an exception is raised, as ``name -> (False, traceback_text)``. The
+    thread starts itself on construction and exits once the queue is empty.
+    '''
+
+    daemon = True
+
+    def __init__(self, name, queue, results, container, jpeg_quality):
+        Thread.__init__(self, name=name)
+        self.queue = queue
+        self.results = results
+        self.container = container
+        self.jpeg_quality = jpeg_quality
+        self.start()
+
+    def run(self):
+        # Keep pulling names until there is nothing left in the queue
+        while True:
+            try:
+                image_name = self.queue.get_nowait()
+            except Empty:
+                return
+            try:
+                self.compress(image_name)
+            except Exception:
+                import traceback
+                self.results[image_name] = (False, traceback.format_exc())
+            finally:
+                # Always mark the item as done, so that queue.join() returns
+                self.queue.task_done()
+
+    def compress(self, name):
+        '''
+        Compress the image called name and record its size before and after.
+        Images whose mime type contains "png" are passed to optimize_png. All
+        other images are passed to optimize_jpeg when jpeg_quality is None,
+        otherwise they are re-encoded with encode_jpeg at that quality.
+        '''
+        from calibre.utils.img import optimize_png, optimize_jpeg, encode_jpeg
+        if 'png' in self.container.mime_map[name]:
+            optimizer = optimize_png
+        elif self.jpeg_quality is not None:
+            optimizer = partial(encode_jpeg, quality=self.jpeg_quality)
+        else:
+            optimizer = optimize_jpeg
+        image_path = self.container.get_file_path_for_processing(name)
+        size_before = os.path.getsize(image_path)
+        # NOTE: the value returned by the optimizer is not inspected here
+        optimizer(image_path)
+        size_after = os.path.getsize(image_path)
+        self.results[name] = (True, (size_before, size_after))
+
+
+def compress_images(container, report=None, names=None, jpeg_quality=None):
+    '''
+    Compress the PNG and JPEG images in the container, in parallel, using at
+    most one worker thread per CPU and never more threads than images.
+
+    :param container: The book container. Its ``manifest_type_map`` is used
+        to find the images with mime types image/png, image/jpg and image/jpeg.
+    :param report: Optional callable that is called with one human readable
+        message at a time. If None, nothing is reported.
+    :param names: Optional iterable of names. If not None, only images whose
+        names are in it are processed.
+    :param jpeg_quality: Passed on to the workers. None means JPEG images are
+        optimized with optimize_jpeg, otherwise they are re-encoded at this
+        quality.
+    :return: ``(changed, results)``. changed is True if the size of at least
+        one image changed. results maps image name to ``(ok, payload)``, where
+        payload is ``(size_before, size_after)`` on success and the traceback
+        text on failure.
+    '''
+    mt_map = container.manifest_type_map
+    images = set()
+    for mt in 'png jpg jpeg'.split():
+        images |= set(mt_map.get('image/' + mt, ()))
+    if names is not None:
+        images &= set(names)
+    results = {}
+    queue = Queue()
+    for name in images:
+        queue.put(name)
+    # The workers start themselves when constructed and exit when the queue
+    # is drained, so no references to them need to be kept.
+    for i in xrange(min(detect_ncpus(), len(images))):
+        Worker('CompressImage%d' % i, queue, results, container, jpeg_quality)
+    queue.join()
+    before_total = after_total = 0
+    for name, (ok, res) in results.iteritems():
+        if ok:
+            before, after = res
+            if before != after:
+                before_total += before
+                after_total += after
+                if report:
+                    report(_('{0} compressed from {1} to {2} bytes [{3:.1%}]').format(
+                        name, human_readable(before), human_readable(after), after/before))
+        elif report:
+            # report is optional, so failures must not be reported
+            # unconditionally
+            report(_('Failed to process {0} with error:').format(name))
+            report(res)
+    if report:
+        if before_total > 0:
+            report('')
+            report(_('Total image filesize reduced from {0} to {1} [{2:.1%}]').format(
+                human_readable(before_total), human_readable(after_total), after_total/before_total))
+        else:
+            report(_('Images are already fully optimized'))
+    return before_total > 0, results
--- a/src/calibre/ebooks/oeb/polish/main.py
+++ b/src/calibre/ebooks/oeb/polish/main.py
@ -14,6 +14,7 @@ from functools import partial
 from calibre.ebooks.oeb.polish.container import get_container
 from calibre.ebooks.oeb.polish.stats import StatsCollector
 from calibre.ebooks.oeb.polish.subset import subset_all_fonts
+from calibre.ebooks.oeb.polish.images import compress_images
 from calibre.ebooks.oeb.polish.embed import embed_all_fonts
 from calibre.ebooks.oeb.polish.cover import set_cover
 from calibre.ebooks.oeb.polish.replace import smarten_punctuation
@ -31,6 +32,7 @@ ALL_OPTS = {
    'remove_jacket':False,
    'smarten_punctuation':False,
    'remove_unused_css':False,
+    'compress_images': False,
 }

 CUSTOMIZATION = {
@ -103,6 +105,12 @@ created from production templates can have a large number of extra CSS rules
 that dont match any actual content. These extra rules can slow down readers
 that need to parse them all.</p>
 '''),
+
+'compress_images': _('''\
+<p>Losslessly compress images in the book, to reduce the filesize, without
+affecting image quality.</p>
+'''),
+
 }

 def hfix(name, raw):
@ -203,6 +211,12 @@ def polish_one(ebook, opts, report, customization=None):
            changed = True
        report('')

+    if opts.compress_images:
+        rt(_('Losslessly compressing images'))
+        if compress_images(ebook, report)[0]:
+            changed = True
+        report('')
+
    return changed


@ -265,6 +279,7 @@ def option_parser():
    o('--remove-jacket', help=CLI_HELP['remove_jacket'])
    o('--smarten-punctuation', '-p', help=CLI_HELP['smarten_punctuation'])
    o('--remove-unused-css', '-u', help=CLI_HELP['remove_unused_css'])
+    o('--compress-images', '-i', help=CLI_HELP['compress_images'])

    o('--verbose', help=_('Produce more verbose output, useful for debugging.'))

--- a/src/calibre/gui2/actions/polish.py
+++ b/src/calibre/gui2/actions/polish.py
@ -67,6 +67,7 @@ class Polish(QDialog):  # {{{
            'jacket':_('<h3>Book Jacket</h3>%s')%HELP['jacket'],
            'remove_jacket':_('<h3>Remove Book Jacket</h3>%s')%HELP['remove_jacket'],
            'remove_unused_css':_('<h3>Remove unused CSS rules</h3>%s')%HELP['remove_unused_css'],
+            'compress_images': _('<h3>Losslessly compress images</h3>%s') % HELP['compress_images'],
        }

        self.l = l = QGridLayout()
@ -85,6 +86,7 @@ class Polish(QDialog):  # {{{
            ('jacket', _('Add/Replace metadata as a "book &jacket" page')),
            ('remove_jacket', _('&Remove a previously inserted book jacket')),
            ('remove_unused_css', _('Remove &unused CSS rules from the book')),
+            ('compress_images', _('Losslessly compress images')),
        ])
        prefs = gprefs.get('polishing_settings', {})
        for name, text in self.all_actions.iteritems():
--- a/src/calibre/utils/img.py
+++ b/src/calibre/utils/img.py
@ -4,10 +4,23 @@

 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
+import os, subprocess, errno, shutil, tempfile
+from threading import Thread

 from PyQt5.Qt import QImage, QByteArray, QBuffer, Qt, QPainter

-from calibre import fit_image
+from calibre import fit_image, force_unicode
+from calibre.constants import iswindows
+from calibre.utils.filenames import atomic_rename
+
+def get_exe_path(name):
+    '''
+    Return the path to the helper program called name. The program is looked
+    for in the directory containing the pdftohtml executable. If the path to
+    pdftohtml has no directory component, the bare program name is returned.
+    On Windows the suffix -calibre.exe is added to name.
+    '''
+    from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
+    exe = (name + '-calibre.exe') if iswindows else name
+    bindir = os.path.dirname(PDFTOHTML)
+    return os.path.join(bindir, exe) if bindir else exe

 def image_from_data(data):
    i = QImage()
@ -48,3 +61,80 @@ def scale_image(data, width=60, height=80, compression_quality=70, as_png=False,
    if not img.save(buf, fmt, quality=compression_quality):
        raise ValueError('Failed to export thumbnail image to: ' + fmt)
    return img.width(), img.height(), ba.data()
+
+
+def run_optimizer(file_path, cmd, as_filter=False, input_data=None):
+    '''
+    Run an external image optimizer on file_path. The program writes to a
+    temporary file in the same directory, which atomically replaces
+    file_path only if the program exits with a zero return code.
+
+    :param file_path: Path to the image file. It is replaced in place on
+        success and left untouched on failure.
+    :param cmd: The command line, as a list. When as_filter is False it must
+        contain the placeholders True and False, which are replaced by the
+        name of the input file and the name of the temporary output file,
+        respectively. The list passed in is not modified.
+    :param as_filter: If True, the input is written to the stdin of the
+        program and whatever it writes to stdout becomes the new file.
+    :param input_data: Optional file-like object to read the input from, in
+        filter mode, instead of file_path. It is closed after being read.
+    :return: None on success. If the program exits with a non-zero return
+        code, the text it output (stderr in filter mode, combined
+        stdout and stderr otherwise).
+    '''
+    file_path = os.path.abspath(file_path)
+    cwd = os.path.dirname(file_path)
+    fd, outfile = tempfile.mkstemp(dir=cwd)
+    outf = None
+    copiers = []
+    try:
+        if as_filter:
+            outf = os.fdopen(fd, 'wb')
+        else:
+            os.close(fd)
+        iname, oname = os.path.basename(file_path), os.path.basename(outfile)
+        if not as_filter:
+            # Substitute the placeholders in a copy, leaving the caller's
+            # list untouched
+            cmd = [iname if x is True else (oname if x is False else x) for x in cmd]
+        if iswindows:
+            # subprocess in python 2 cannot handle unicode strings that are not
+            # encodeable in mbcs, so we fail here, where it is more explicit,
+            # instead.
+            cmd = [x.encode('mbcs') if isinstance(x, type('')) else x for x in cmd]
+            if isinstance(cwd, type('')):
+                cwd = cwd.encode('mbcs')
+        stdin = subprocess.PIPE if as_filter else None
+        stderr = subprocess.PIPE if as_filter else subprocess.STDOUT
+        p = subprocess.Popen(cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=stderr, stdin=stdin)
+        stderr = p.stderr if as_filter else p.stdout
+        if as_filter:
+            src = input_data or open(file_path, 'rb')
+            def copy(src, dest):
+                try:
+                    shutil.copyfileobj(src, dest)
+                finally:
+                    try:
+                        src.close()
+                    finally:
+                        dest.close()
+            # stdin and stdout are pumped in separate threads, while this
+            # thread drains stderr, so that no pipe can fill up and block.
+            # The thread name must be passed by keyword, as the first
+            # positional argument of Thread is group, not name.
+            for tname, targs in (('CopyInput', (src, p.stdin)), ('CopyOutput', (p.stdout, outf))):
+                t = Thread(name=tname, target=copy, args=targs)
+                t.daemon = True
+                t.start()
+                copiers.append(t)
+        raw = force_unicode(stderr.read())
+        rc = p.wait()
+        # Wait for the copying to finish, so that the output file is
+        # completely written and closed before it is renamed or removed
+        for t in copiers:
+            t.join()
+        if rc != 0:
+            return raw
+        shutil.copystat(file_path, outfile)
+        atomic_rename(outfile, file_path)
+    finally:
+        if outf is not None and not copiers:
+            # The copy threads close the output file themselves, so it only
+            # needs closing here if they were never started
+            outf.close()
+        try:
+            os.remove(outfile)
+        except EnvironmentError as err:
+            # After a successful rename the temporary file no longer exists
+            if err.errno != errno.ENOENT:
+                raise
+
+def optimize_jpeg(file_path):
+    '''
+    Run jpegtran on the JPEG file at file_path, replacing the file with the
+    result. Returns whatever run_optimizer() returns.
+    '''
+    switches = '-copy none -optimize -progressive -maxmemory 100M -outfile'.split()
+    cmd = [get_exe_path('jpegtran')]
+    cmd.extend(switches)
+    # False and True are the placeholders for the output and input file names
+    cmd.extend((False, True))
+    return run_optimizer(file_path, cmd)
+
+def optimize_png(file_path):
+    '''
+    Run optipng on the PNG file at file_path, replacing the file with the
+    result. Returns whatever run_optimizer() returns.
+    '''
+    switches = '-fix -clobber -strip all -o7 -out'.split()
+    cmd = [get_exe_path('optipng')]
+    cmd.extend(switches)
+    # False and True are the placeholders for the output and input file names
+    cmd.extend((False, True))
+    return run_optimizer(file_path, cmd)
+
+def encode_jpeg(file_path, quality=80):
+    '''
+    Re-encode the image at file_path as a JPEG, in place, by loading it,
+    exporting it as PPM and piping that through cjpeg.
+
+    :param file_path: Path to the image file, it is replaced on success.
+    :param quality: The JPEG quality. It is converted to an integer and
+        clamped to the range 0-100.
+    :return: Whatever run_optimizer() returns.
+    :raises ValueError: If the image cannot be loaded or cannot be exported
+        as PPM.
+    '''
+    from calibre.srv.utils import ReadOnlyFileBuffer
+    quality = max(0, min(100, int(quality)))
+    exe = get_exe_path('cjpeg')
+    cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [str(quality)]
+    img = QImage()
+    if not img.load(file_path):
+        raise ValueError('%s is not a valid image file' % file_path)
+    ba = QByteArray()
+    buf = QBuffer(ba)
+    buf.open(QBuffer.WriteOnly)
+    if not img.save(buf, 'PPM'):
+        raise ValueError('Failed to export image to PPM')
+    # data is a method and must be called to get at the actual bytes
+    return run_optimizer(file_path, cmd, as_filter=True, input_data=ReadOnlyFileBuffer(ba.data()))
--- a/src/pyj/srv.pyj
+++ b/src/pyj/srv.pyj
@ -44,6 +44,7 @@ def load_book_list():

 def on_load():
    if window.calibre_entry_point == 'book list':
+        print('calibre loaded at:', Date().toString())
        load_book_list()

 # We wait for all page elements to load, since this is a single page app