From 62211f40069e8ab510c3a0cab6e5178d8851e75b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 17 Mar 2013 11:58:33 +0530
Subject: [PATCH] News download: Add an option recipe authors can set to have
 calibre automatically reduce the size of downloaded images by lowering their
 quality

---
 src/calibre/web/feeds/news.py   | 45 +++++++++++++++++++++++++++++-
 src/calibre/web/fetch/simple.py | 49 +++++++++++++++++++++++++++++++--
 2 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 5bf09d8a3b..e9348f6ae7 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -338,6 +338,41 @@ class BasicNewsRecipe(Recipe):
     #: :meth:`javascript_login` method, to do the actual logging in.
     use_javascript_to_login = False
 
+    # The following parameters control how the recipe attempts to minimize
+    # JPEG image sizes
+
+    #: Set this to False to ignore all scaling and compression parameters and
+    #: pass images through unmodified. If True and the other compression
+    #: parameters are left at their default values, JPEG images will be scaled to fit
+    #: in the screen dimensions set by the output profile and compressed to at
+    #: most (w * h)/16 bytes, where w x h are the scaled image dimensions.
+    compress_news_images = False
+
+    #: The factor used when auto compressing JPEG images. If set to None,
+    #: auto compression is disabled. Otherwise, the images will be reduced in size to
+    #: (w * h)/compress_news_images_auto_size bytes if possible by reducing
+    #: the quality level, where w x h are the image dimensions in pixels.
+    #: The minimum JPEG quality will be 5/100 so it is possible this constraint
+    #: will not be met. This parameter can be overridden by the parameter
+    #: compress_news_images_max_size which provides a fixed maximum size for images.
+    compress_news_images_auto_size = 16
+
+    #: Set JPEG quality so images do not exceed the size given (in KBytes).
+    #: If set, this parameter overrides auto compression via compress_news_images_auto_size.
+    #: The minimum JPEG quality will be 5/100 so it is possible this constraint
+    #: will not be met.
+    compress_news_images_max_size = None
+
+    #: Rescale images to fit in the device screen dimensions set by the output profile.
+    #: Ignored if no output profile is set.
+    scale_news_images_to_device = True
+
+    #: Maximum dimensions (w, h) to scale images to. If scale_news_images_to_device is
+    #: True this is set to the screen dimensions of the output profile, unless no
+    #: profile is set, in which case it keeps whatever value it has been assigned
+    #: (default None). Only has an effect when compress_news_images is True.
+    scale_news_images = None
+
     # See the built-in profiles for examples of these settings.
 
     def short_title(self):
@@ -849,11 +884,19 @@ class BasicNewsRecipe(Recipe):
         for reg in self.filter_regexps:
             web2disk_cmdline.extend(['--filter-regexp', reg])
 
+        if options.output_profile.short_name == 'default':
+            self.scale_news_images_to_device = False
+        elif self.scale_news_images_to_device:
+            self.scale_news_images = options.output_profile.screen_size
+
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
-                      'remove_tags_before', 'is_link_wanted'):
+                      'remove_tags_before', 'is_link_wanted',
+                      'compress_news_images', 'compress_news_images_max_size',
+                      'compress_news_images_auto_size', 'scale_news_images'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
+
         self.web2disk_options.postprocess_html = self._postprocess_html
         self.web2disk_options.encoding = self.encoding
         self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index e7ad119dae..7cc8bd9309 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -12,7 +12,7 @@ from urllib import url2pathname, quote
 from httplib import responses
 from base64 import b64decode
 
-from calibre import browser, relpath, unicode_path
+from calibre import browser, relpath, unicode_path, fit_image
 from calibre.constants import filesystem_encoding, iswindows
 from calibre.utils.filenames import ascii_filename
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
@@ -20,7 +20,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import Log
 from calibre.utils.magick import Image
-from calibre.utils.magick.draw import identify_data
+from calibre.utils.magick.draw import identify_data, thumbnail
 
 class FetchError(Exception):
     pass
@@ -142,6 +142,10 @@ class RecursiveFetcher(object):
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self._is_link_wanted     = getattr(options, 'is_link_wanted',
                 default_is_link_wanted)
+        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
+        self.compress_news_images = getattr(options, 'compress_news_images', False)
+        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
+        self.scale_news_images = getattr(options, 'scale_news_images', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
@@ -338,7 +342,42 @@ class RecursiveFetcher(object):
                             x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
 
+    def rescale_image(self, data):
+        '''Return JPEG bytes scaled to fit self.scale_news_images (when set) and
+        re-encoded at quality 90 down to 5 until within the size budget. If that
+        does not shrink the data, the smaller earlier version is returned.'''
+        orig_w, orig_h, ifmt = identify_data(data)
+        orig_data = data # save it in case compression fails
+        if self.scale_news_images is not None:
+            wmax, hmax = self.scale_news_images
+            scale, new_w, new_h = fit_image(orig_w, orig_h, wmax, hmax)
+            if scale:
+                data = thumbnail(data, new_w, new_h, compression_quality=95)[-1]
+                orig_w, orig_h = new_w, new_h
+        if self.compress_news_images_max_size is None:
+            if self.compress_news_images_auto_size is None: # not compressing
+                return data
+            maxsizeb = (orig_w * orig_h)/self.compress_news_images_auto_size
+        else:
+            maxsizeb = self.compress_news_images_max_size * 1024
+        scaled_data = data # save it in case compression fails
+        if len(scaled_data) <= maxsizeb: # no compression required
+            return scaled_data
 
+        img = Image()
+        quality = 95
+        img.load(data)
+        # Lowest quality tried is 5, the documented floor; never step to 0.
+        while len(data) >= maxsizeb and quality > 5:
+            quality -= 5
+            img.set_compression_quality(quality)
+            data = img.export('jpg')
+
+        if len(data) >= len(scaled_data): # compression failed
+            return orig_data if len(orig_data) <= len(scaled_data) else scaled_data
+        if len(data) >= len(orig_data): # no improvement
+            return orig_data
+        return data
 
     def process_images(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
@@ -390,6 +429,12 @@ class RecursiveFetcher(object):
                         im = Image()
                         im.load(data)
                         data = im.export(itype)
+                    if self.compress_news_images and itype in {'jpg','jpeg'}:
+                        try:
+                            data = self.rescale_image(data)
+                        except:
+                            self.log.exception('failed to compress image '+iurl)
+                            identify_data(data)
                     else:
                         identify_data(data)
                     imgpath = os.path.join(diskpath, fname+'.'+itype)