Metadata download: Add a plugin to download book covers from a Google image search. Go to Preferences->Metadata download and enable the plugin to use it. Google Image search often finds larger and/or different covers than the other sources; however, it sometimes finds junk. Use at your discretion.

Kovid Goyal 2013-04-01 22:54:09 +05:30
parent 1663619cef
commit 9ba0272b0c
14 changed files with 263 additions and 60 deletions
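The new source is off by default; besides the Preferences->Metadata download route described in the commit message, it can be toggled from code. A minimal sketch, assuming the enable_plugin() and all_metadata_plugins() helpers in calibre.customize.ui behave as their names suggest (only is_disabled() actually appears in the diff below):

from calibre.customize.ui import is_disabled, enable_plugin, all_metadata_plugins

for plugin in all_metadata_plugins():
    if plugin.name == 'Google Images':
        if is_disabled(plugin):    # it ships in default_disabled_plugins, see below
            enable_plugin(plugin)  # assumed helper; persists via the 'enabled_plugins' config key
        break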

View File

@@ -757,8 +757,9 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
from calibre.ebooks.metadata.sources.google_images import GoogleImages
plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [GoogleBooks, Amazon, Edelweiss, GoogleImages, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
# }}}
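Sketch only (not part of the commit): once GoogleImages is registered in the builtin plugins list above, it is discoverable through the same query the cover-download GUI uses later in this commit. My assumption is that metadata_plugins() yields only enabled plugins, and Google Images starts out disabled by default.

from calibre.customize.ui import metadata_plugins

cover_sources = [p.name for p in metadata_plugins(['cover'])]
# 'Google Images' should appear in this list once the plugin has been enabled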

View File

@@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss',
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
])
def is_disabled(plugin):

View File

@@ -858,7 +858,7 @@ class Amazon(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
# Google covers are often poor quality (scans/errors) but they have high
# resolution, so they trump covers from better sources. So make sure they
# are only used if no other covers are found.
msprefs.defaults['cover_priorities'] = {'Google':2}
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
def create_log(ostream=None):
from calibre.utils.logging import ThreadSafeLog, FileStream
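The comment in the hunk above is the rationale for giving both Google sources priority 2: a priority map, not raw resolution, decides which cover wins. A hedged sketch of how such a map could be applied; the real ranking code lives elsewhere in the identify pipeline and is not part of this hunk, and the default priority of 1 for unlisted sources is an assumption.

def rank_covers(results, priorities):
    # results are (source_name, width, height) tuples; lower priority wins,
    # and resolution only breaks ties within the same priority
    return sorted(results, key=lambda r: (priorities.get(r[0], 1), -(r[1] * r[2])))

print(rank_covers([('Google Images', 1600, 2400), ('Some Source', 600, 800)],
                  {'Google': 2, 'Google Images': 2}))
# The smaller 'Some Source' cover still sorts first, because its priority is 1.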
@@ -222,6 +222,9 @@ class Source(Plugin):
#: plugin
config_help_message = None
#: If True this source can return multiple covers for a given query
can_get_multiple_covers = False
def __init__(self, *args, **kwargs):
Plugin.__init__(self, *args, **kwargs)
@@ -522,7 +525,7 @@
return None
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
'''
Download a cover and put it into result_queue. The parameters all have
the same meaning as for :meth:`identify`. Put (self, cover_data) into
@@ -531,6 +534,9 @@
This method should use cached cover URLs for efficiency whenever
possible. When cached data is not present, most plugins simply call
identify and use its results.
If the parameter get_best_cover is True and this plugin can get
multiple covers, it should only get the "best" one.
'''
pass
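A hedged sketch of the contract described in the docstring above, modelled on the Google Images plugin added later in this commit; get_image_urls() and download_image() are hypothetical stand-ins for whatever a real source uses.

from calibre.ebooks.metadata.sources.base import Source

class MultiCoverSource(Source):
    can_get_multiple_covers = True

    def download_cover(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30, get_best_cover=False):
        urls = self.get_image_urls(title, authors, log, abort, timeout)  # hypothetical helper
        if get_best_cover:
            urls = urls[:1]  # only the "best" candidate when the caller wants a single cover
        for url in urls:
            self.download_image(url, timeout, log, result_queue)  # hypothetical helper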

View File

@@ -35,9 +35,14 @@ class Worker(Thread):
start_time = time.time()
if not self.abort.is_set():
try:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors,
identifiers=self.identifiers, timeout=self.timeout)
if self.plugin.can_get_multiple_covers:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors, get_best_cover=True,
identifiers=self.identifiers, timeout=self.timeout)
else:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors,
identifiers=self.identifiers, timeout=self.timeout)
except:
self.log.exception('Failed to download cover from',
self.plugin.name)
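The if/else above exists only to add get_best_cover=True for sources that advertise can_get_multiple_covers. An equivalent formulation, offered purely as a sketch (the commit keeps the explicit branches), using a hypothetical helper:

def dispatch_download_cover(plugin, log, rq, abort, title, authors, identifiers, timeout):
    kwargs = dict(title=title, authors=authors, identifiers=identifiers, timeout=timeout)
    if plugin.can_get_multiple_covers:
        # ask multi-cover sources for just their best image in this context
        kwargs['get_best_cover'] = True
    plugin.download_cover(log, rq, abort, **kwargs)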

View File

@@ -221,7 +221,7 @@ class Douban(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -320,7 +320,7 @@ class Edelweiss(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -209,7 +209,7 @@ class GoogleBooks(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from collections import OrderedDict

from calibre import as_unicode
from calibre.ebooks.metadata.sources.base import Source, Option

class GoogleImages(Source):

    name = 'Google Images'
    description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Google Image Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the google search result')),
               Option('size', 'choices', 'svga', _('Cover size'),
                      _('Search for covers larger than the specified size'),
                      choices=OrderedDict((
                          ('any', _('Any size'),),
                          ('l', _('Large'),),
                          ('qsvga', _('Larger than %s')%'400x300',),
                          ('vga', _('Larger than %s')%'640x480',),
                          ('svga', _('Larger than %s')%'600x800',),
                          ('xga', _('Larger than %s')%'1024x768',),
                          ('2mp', _('Larger than %s')%'2 MP',),
                          ('4mp', _('Larger than %s')%'4 MP',),
                      ))),
               )

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        from threading import Thread
        import time
        timeout = max(60, timeout) # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        if not urls:
            log('No images found in Google for, title: %r and authors: %r'%(title, author))
            return
        urls = urls[:self.prefs['max_covers']]
        if get_best_cover:
            urls = urls[:1]
        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
        for w in workers:
            w.daemon = True
            w.start()
        alive = True
        start_time = time.time()
        while alive and not abort.is_set() and time.time() - start_time < timeout:
            alive = False
            for w in workers:
                if w.is_alive():
                    alive = True
                    break
            abort.wait(0.1)

    def download_image(self, url, timeout, log, result_queue):
        try:
            ans = self.browser.open_novisit(url, timeout=timeout).read()
            result_queue.put((self, ans))
            log('Downloaded cover from: %s'%url)
        except Exception:
            self.log.exception('Failed to download cover from: %r'%url)

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.ipc.simple_worker import fork_job, WorkerError
        try:
            return fork_job('calibre.ebooks.metadata.sources.google_images',
                    'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result']
        except WorkerError as e:
            if e.orig_tb:
                log.error(e.orig_tb)
            log.exception('Searching google failed:' + as_unicode(e))
        except Exception as e:
            log.exception('Searching google failed:' + as_unicode(e))

        return []

USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Firefox/3.6.13'

def find_image_urls(br, ans):
    import urlparse
    for w in br.page.mainFrame().documentElement().findAll('.images_table a[href]'):
        try:
            imgurl = urlparse.parse_qs(urlparse.urlparse(unicode(w.attribute('href'))).query)['imgurl'][0]
        except:
            continue
        if imgurl not in ans:
            ans.append(imgurl)

def search(title, author, size, timeout, debug=False):
    import time
    from calibre.web.jsbrowser.browser import Browser, LoadWatcher, Timeout
    ans = []
    start_time = time.time()
    br = Browser(user_agent=USER_AGENT, enable_developer_tools=debug)
    br.visit('https://www.google.com/advanced_image_search')
    f = br.select_form('form[action="/search"]')
    f['as_q'] = '%s %s'%(title, author)
    if size != 'any':
        f['imgsz'] = size
    f['imgar'] = 't|xt'
    f['as_filetype'] = 'jpg'
    br.submit(wait_for_load=False)

    # Loop until the page finishes loading or at least five image urls are
    # found
    lw = LoadWatcher(br.page, br)
    while lw.is_loading and len(ans) < 5:
        br.run_for_a_time(0.2)
        find_image_urls(br, ans)
        if time.time() - start_time > timeout:
            raise Timeout('Timed out trying to load google image search page')
    find_image_urls(br, ans)
    if debug:
        br.show_browser()
    br.close()
    del br # Needed to prevent PyQt from segfaulting
    return ans

def test_google():
    import pprint
    pprint.pprint(search('heroes', 'abercrombie', 'svga', 60, debug=True))

def test():
    from Queue import Queue
    from threading import Event
    from calibre.utils.logging import default_log
    p = GoogleImages(None)
    rq = Queue()
    p.download_cover(default_log, rq, Event(), title='The Heroes',
            authors=('Joe Abercrombie',))
    print ('Downloaded', rq.qsize(), 'covers')

if __name__ == '__main__':
    test()
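To exercise the source in isolation, the module's own test() above already covers the basic path. The variant below is only a sketch: it additionally reads the plugin options back through prefs and uses the new get_best_cover flag, assuming prefs falls back to the defaults declared on the class.

from __future__ import print_function
from Queue import Queue
from threading import Event
from calibre.utils.logging import default_log
from calibre.ebooks.metadata.sources.google_images import GoogleImages

p = GoogleImages(None)
print(p.prefs['size'], p.prefs['max_covers'])  # expected: 'svga' 5, per the Option defaults above
rq = Queue()
p.download_cover(default_log, rq, Event(), title='The Heroes',
        authors=('Joe Abercrombie',), get_best_cover=True)
print(rq.qsize(), 'cover(s) downloaded')       # at most 1, because get_best_cover=True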

View File

@@ -19,7 +19,7 @@ class OpenLibrary(Source):
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if 'isbn' not in identifiers:
return
isbn = identifiers['isbn']

View File

@@ -75,7 +75,7 @@ class OverDrive(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
import mechanize
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:

View File

@@ -55,7 +55,7 @@ class Ozon(Source):
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
ozonid = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper()
if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
qItems = set([isbn, title])
@@ -64,19 +64,19 @@ class Ozon(Source):
qItems.discard(None)
qItems.discard('')
qItems = map(_quoteString, qItems)
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
search_url += quote_plus(q)
else:
search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
log.debug(u'search url: %r'%search_url)
return search_url
# }}}
@@ -250,7 +250,7 @@ class Ozon(Source):
return url
# }}}
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.debug('No cached cover found, running identify')

View File

@@ -11,6 +11,7 @@ import os
from threading import Event, Thread
from Queue import Queue, Empty
from io import BytesIO
from collections import Counter
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
@@ -113,13 +114,18 @@ def single_covers(title, authors, identifiers, caches, tdir):
kwargs=dict(title=title, authors=authors, identifiers=identifiers))
worker.daemon = True
worker.start()
c = Counter()
while worker.is_alive():
try:
plugin, width, height, fmt, data = results.get(True, 1)
except Empty:
continue
else:
name = '%s,,%s,,%s,,%s.cover'%(plugin.name, width, height, fmt)
name = plugin.name
if plugin.can_get_multiple_covers:
name += '{%d}'%c[plugin.name]
c[plugin.name] += 1
name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt)
with open(name, 'wb') as f:
f.write(data)
os.mkdir(name+'.done')
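Worked example (values invented) of the file names the loop above now produces; the '{n}' suffix added for multi-cover sources is what the GUI's CoversModel.update_result keys on further down in this commit.

from collections import Counter

c = Counter()

def cover_name(plugin_name, can_get_multiple, width, height, fmt):
    # mirrors the naming logic above
    name = plugin_name
    if can_get_multiple:
        name += '{%d}' % c[plugin_name]
        c[plugin_name] += 1
    return '%s,,%s,,%s,,%s.cover' % (name, width, height, fmt)

print(cover_name('Google Images', True, 800, 1200, 'jpg'))  # Google Images{0},,800,,1200,,jpg.cover
print(cover_name('Google Images', True, 600, 900, 'jpg'))   # Google Images{1},,600,,900,,jpg.cover
print(cover_name('Some Source', False, 450, 680, 'jpg'))    # Some Source,,450,,680,,jpg.cover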

View File

@@ -16,13 +16,12 @@ from operator import attrgetter
from Queue import Queue, Empty
from io import BytesIO
from PyQt4.Qt import (QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt,
QApplication, QDialog, QVBoxLayout, QLabel,
QDialogButtonBox, QStyle, QStackedWidget, QWidget,
QTableView, QGridLayout, QFontInfo, QPalette, QTimer,
pyqtSignal, QAbstractTableModel, QVariant, QSize,
QListView, QPixmap, QAbstractListModel, QColor, QRect,
QTextBrowser, QStringListModel)
from PyQt4.Qt import (
QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt, QApplication,
QDialog, QVBoxLayout, QLabel, QDialogButtonBox, QStyle, QStackedWidget,
QWidget, QTableView, QGridLayout, QFontInfo, QPalette, QTimer, pyqtSignal,
QAbstractTableModel, QVariant, QSize, QListView, QPixmap, QModelIndex,
QAbstractListModel, QColor, QRect, QTextBrowser, QStringListModel)
from PyQt4.QtWebKit import QWebView
from calibre.customize.ui import metadata_plugins
@@ -654,7 +653,7 @@ class CoversModel(QAbstractListModel): # {{{
for i, plugin in enumerate(metadata_plugins(['cover'])):
self.covers.append((plugin.name+'\n'+_('Searching...'),
QVariant(self.blank), None, True))
self.plugin_map[plugin] = i+1
self.plugin_map[plugin] = [i+1]
if do_reset:
self.reset()
@@ -685,48 +684,82 @@ class CoversModel(QAbstractListModel): # {{{
def plugin_for_index(self, index):
row = index.row() if hasattr(index, 'row') else index
for k, v in self.plugin_map.iteritems():
if v == row:
if row in v:
return k
def cover_keygen(self, x):
pmap = x[2]
if pmap is None:
return 1
return pmap.width()*pmap.height()
def clear_failed(self):
# Remove entries that are still waiting
good = []
pmap = {}
dcovers = sorted(self.covers[1:], key=self.cover_keygen, reverse=True)
cmap = {x:self.covers.index(x) for x in self.covers}
def keygen(x):
pmap = x[2]
if pmap is None:
return 1
return pmap.width()*pmap.height()
dcovers = sorted(self.covers[1:], key=keygen, reverse=True)
cmap = {i:self.plugin_for_index(i) for i in xrange(len(self.covers))}
for i, x in enumerate(self.covers[0:1] + dcovers):
if not x[-1]:
good.append(x)
if i > 0:
plugin = self.plugin_for_index(cmap[x])
pmap[plugin] = len(good) - 1
plugin = cmap[i]
if plugin is not None:
try:
pmap[plugin].append(len(good) - 1)
except KeyError:
pmap[plugin] = [len(good)-1]
self.covers = good
self.plugin_map = pmap
self.reset()
def index_for_plugin(self, plugin):
idx = self.plugin_map.get(plugin, 0)
return self.index(idx)
def pointer_from_index(self, index):
row = index.row() if hasattr(index, 'row') else index
try:
return self.covers[row][2]
except IndexError:
pass
def index_from_pointer(self, pointer):
for r, (text, scaled, pmap, waiting) in enumerate(self.covers):
if pointer == pmap:
return self.index(r)
return self.index(0)
def update_result(self, plugin_name, width, height, data):
idx = None
for plugin, i in self.plugin_map.iteritems():
if plugin.name == plugin_name:
idx = i
break
if idx is None:
return
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False)
self.dataChanged.emit(self.index(idx), self.index(idx))
if plugin_name.endswith('}'):
# multi cover plugin
plugin_name = plugin_name.partition('{')[0]
plugin = [plugin for plugin in self.plugin_map if plugin.name == plugin_name]
if not plugin:
return
plugin = plugin[0]
last_row = max(self.plugin_map[plugin])
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.beginInsertRows(QModelIndex(), last_row, last_row)
for rows in self.plugin_map.itervalues():
for i in xrange(len(rows)):
if rows[i] >= last_row:
rows[i] += 1
self.plugin_map[plugin].insert(-1, last_row)
self.covers.insert(last_row, self.get_item(plugin_name, pmap, waiting=False))
self.endInsertRows()
else:
# single cover plugin
idx = None
for plugin, rows in self.plugin_map.iteritems():
if plugin.name == plugin_name:
idx = rows[0]
break
if idx is None:
return
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False)
self.dataChanged.emit(self.index(idx), self.index(idx))
def cover_pixmap(self, index):
row = index.row()
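Illustrative only: the shape plugin_map now takes (a list of row indices per source) and how update_result above tells the two result kinds apart; the dict keys below stand in for real plugin objects.

plugin_map = {'Google Images': [1, 5, 6],  # a multi-cover source can now own several rows
              'Some Source': [2]}          # a single-cover source keeps exactly one row

plugin_name = 'Google Images{2}'           # name reported for the third Google Images cover
if plugin_name.endswith('}'):
    # multi-cover result: a new row is inserted and added to the source's row list
    base = plugin_name.partition('{')[0]   # 'Google Images'
# otherwise the source's single existing row is updated in place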
@@ -774,9 +807,12 @@ class CoversView(QListView): # {{{
self.m.reset_covers()
def clear_failed(self):
plugin = self.m.plugin_for_index(self.currentIndex())
pointer = self.m.pointer_from_index(self.currentIndex())
self.m.clear_failed()
self.select(self.m.index_for_plugin(plugin).row())
if pointer is None:
self.select(0)
else:
self.select(self.m.index_from_pointer(pointer).row())
# }}}
@@ -852,10 +888,11 @@ class CoversWidget(QWidget): # {{{
if num < 2:
txt = _('Could not find any covers for <b>%s</b>')%self.book.title
else:
txt = _('Found <b>%(num)d</b> covers of %(title)s. '
'Pick the one you like best.')%dict(num=num-1,
txt = _('Found <b>%(num)d</b> possible covers for %(title)s. '
'When the download completes, the covers will be sorted by size.')%dict(num=num-1,
title=self.title)
self.msg.setText(txt)
self.msg.setWordWrap(True)
self.finished.emit()