From 9ba0272b0c1768980c0d5cff8275dbb83d0bcf5b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 1 Apr 2013 22:54:09 +0530 Subject: [PATCH] Metadata download: Add a plugin to download book covers from a google image search. Go to Preferences->Metadata download and enable the plugin to use it. Google Image search often finds larger and/or different covers from the other sources, however, it sometimes finds junk. Use at your discretion. --- src/calibre/customize/builtins.py | 3 +- src/calibre/customize/ui.py | 2 +- src/calibre/ebooks/metadata/sources/amazon.py | 2 +- src/calibre/ebooks/metadata/sources/base.py | 10 +- src/calibre/ebooks/metadata/sources/covers.py | 11 +- src/calibre/ebooks/metadata/sources/douban.py | 2 +- .../ebooks/metadata/sources/edelweiss.py | 2 +- src/calibre/ebooks/metadata/sources/google.py | 2 +- .../ebooks/metadata/sources/google_images.py | 148 ++++++++++++++++++ .../ebooks/metadata/sources/openlibrary.py | 2 +- .../ebooks/metadata/sources/overdrive.py | 2 +- src/calibre/ebooks/metadata/sources/ozon.py | 12 +- src/calibre/ebooks/metadata/sources/worker.py | 8 +- src/calibre/gui2/metadata/single_download.py | 117 +++++++++----- 14 files changed, 263 insertions(+), 60 deletions(-) create mode 100644 src/calibre/ebooks/metadata/sources/google_images.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index e157c36c5e..c87c8c63d0 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -757,8 +757,9 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive from calibre.ebooks.metadata.sources.douban import Douban from calibre.ebooks.metadata.sources.ozon import Ozon +from calibre.ebooks.metadata.sources.google_images import GoogleImages -plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon] +plugins += [GoogleBooks, Amazon, Edelweiss, GoogleImages, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon] # }}} diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 849d1a21f4..06fd2784e4 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', + 'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images', ]) def is_disabled(plugin): diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index a8e15a6d94..3fefe2d886 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -858,7 +858,7 @@ class Amazon(Source): # }}} def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index e15d11c3c1..41812af8eb 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False # Google covers are often poor quality (scans/errors) but they have high # resolution, so they trump covers from better sources. So make sure they # are only used if no other covers are found. -msprefs.defaults['cover_priorities'] = {'Google':2} +msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2} def create_log(ostream=None): from calibre.utils.logging import ThreadSafeLog, FileStream @@ -222,6 +222,9 @@ class Source(Plugin): #: plugin config_help_message = None + #: If True this source can return multiple covers for a given query + can_get_multiple_covers = False + def __init__(self, *args, **kwargs): Plugin.__init__(self, *args, **kwargs) @@ -522,7 +525,7 @@ class Source(Plugin): return None def download_cover(self, log, result_queue, abort, - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): ''' Download a cover and put it into result_queue. The parameters all have the same meaning as for :meth:`identify`. Put (self, cover_data) into @@ -531,6 +534,9 @@ class Source(Plugin): This method should use cached cover URLs for efficiency whenever possible. When cached data is not present, most plugins simply call identify and use its results. + + If the parameter get_best_cover is True and this plugin can get + multiple covers, it should only get the "best" one. ''' pass diff --git a/src/calibre/ebooks/metadata/sources/covers.py b/src/calibre/ebooks/metadata/sources/covers.py index d28ce146c6..0fe963e3f7 100644 --- a/src/calibre/ebooks/metadata/sources/covers.py +++ b/src/calibre/ebooks/metadata/sources/covers.py @@ -35,9 +35,14 @@ class Worker(Thread): start_time = time.time() if not self.abort.is_set(): try: - self.plugin.download_cover(self.log, self.rq, self.abort, - title=self.title, authors=self.authors, - identifiers=self.identifiers, timeout=self.timeout) + if self.plugin.can_get_multiple_covers: + self.plugin.download_cover(self.log, self.rq, self.abort, + title=self.title, authors=self.authors, get_best_cover=True, + identifiers=self.identifiers, timeout=self.timeout) + else: + self.plugin.download_cover(self.log, self.rq, self.abort, + title=self.title, authors=self.authors, + identifiers=self.identifiers, timeout=self.timeout) except: self.log.exception('Failed to download cover from', self.plugin.name) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index 6857d62d4d..f955fb8a79 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -221,7 +221,7 @@ class Douban(Source): # }}} def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') diff --git a/src/calibre/ebooks/metadata/sources/edelweiss.py b/src/calibre/ebooks/metadata/sources/edelweiss.py index c86f16ff0d..53ae6c6ee3 100644 --- a/src/calibre/ebooks/metadata/sources/edelweiss.py +++ b/src/calibre/ebooks/metadata/sources/edelweiss.py @@ -320,7 +320,7 @@ class Edelweiss(Source): # }}} def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py index 3962afcb5e..c03f20cd6b 100644 --- a/src/calibre/ebooks/metadata/sources/google.py +++ b/src/calibre/ebooks/metadata/sources/google.py @@ -209,7 +209,7 @@ class GoogleBooks(Source): # }}} def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') diff --git a/src/calibre/ebooks/metadata/sources/google_images.py b/src/calibre/ebooks/metadata/sources/google_images.py new file mode 100644 index 0000000000..c755fea192 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/google_images.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from collections import OrderedDict + +from calibre import as_unicode +from calibre.ebooks.metadata.sources.base import Source, Option + +class GoogleImages(Source): + + name = 'Google Images' + description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.') + capabilities = frozenset(['cover']) + config_help_message = _('Configure the Google Image Search plugin') + can_get_multiple_covers = True + options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'), + _('The maximum number of covers to process from the google search result')), + Option('size', 'choices', 'svga', _('Cover size'), + _('Search for covers larger than the specified size'), + choices=OrderedDict(( + ('any', _('Any size'),), + ('l', _('Large'),), + ('qsvga', _('Larger than %s')%'400x300',), + ('vga', _('Larger than %s')%'640x480',), + ('svga', _('Larger than %s')%'600x800',), + ('xga', _('Larger than %s')%'1024x768',), + ('2mp', _('Larger than %s')%'2 MP',), + ('4mp', _('Larger than %s')%'4 MP',), + ))), + ) + + def download_cover(self, log, result_queue, abort, + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): + if not title: + return + from threading import Thread + import time + timeout = max(60, timeout) # Needs at least a minute + title = ' '.join(self.get_title_tokens(title)) + author = ' '.join(self.get_author_tokens(authors)) + urls = self.get_image_urls(title, author, log, abort, timeout) + if not urls: + log('No images found in Google for, title: %r and authors: %r'%(title, author)) + return + urls = urls[:self.prefs['max_covers']] + if get_best_cover: + urls = urls[:1] + workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls] + for w in workers: + w.daemon = True + w.start() + alive = True + start_time = time.time() + while alive and not abort.is_set() and time.time() - start_time < timeout: + alive = False + for w in workers: + if w.is_alive(): + alive = True + break + abort.wait(0.1) + + def download_image(self, url, timeout, log, result_queue): + try: + ans = self.browser.open_novisit(url, timeout=timeout).read() + result_queue.put((self, ans)) + log('Downloaded cover from: %s'%url) + except Exception: + self.log.exception('Failed to download cover from: %r'%url) + + def get_image_urls(self, title, author, log, abort, timeout): + from calibre.utils.ipc.simple_worker import fork_job, WorkerError + try: + return fork_job('calibre.ebooks.metadata.sources.google_images', + 'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result'] + except WorkerError as e: + if e.orig_tb: + log.error(e.orig_tb) + log.exception('Searching google failed:' + as_unicode(e)) + except Exception as e: + log.exception('Searching google failed:' + as_unicode(e)) + + return [] + +USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Firefox/3.6.13' + +def find_image_urls(br, ans): + import urlparse + for w in br.page.mainFrame().documentElement().findAll('.images_table a[href]'): + try: + imgurl = urlparse.parse_qs(urlparse.urlparse(unicode(w.attribute('href'))).query)['imgurl'][0] + except: + continue + if imgurl not in ans: + ans.append(imgurl) + +def search(title, author, size, timeout, debug=False): + import time + from calibre.web.jsbrowser.browser import Browser, LoadWatcher, Timeout + ans = [] + start_time = time.time() + br = Browser(user_agent=USER_AGENT, enable_developer_tools=debug) + br.visit('https://www.google.com/advanced_image_search') + f = br.select_form('form[action="/search"]') + f['as_q'] = '%s %s'%(title, author) + if size != 'any': + f['imgsz'] = size + f['imgar'] = 't|xt' + f['as_filetype'] = 'jpg' + br.submit(wait_for_load=False) + + # Loop until the page finishes loading or at least five image urls are + # found + lw = LoadWatcher(br.page, br) + while lw.is_loading and len(ans) < 5: + br.run_for_a_time(0.2) + find_image_urls(br, ans) + if time.time() - start_time > timeout: + raise Timeout('Timed out trying to load google image search page') + find_image_urls(br, ans) + if debug: + br.show_browser() + br.close() + del br # Needed to prevent PyQt from segfaulting + return ans + +def test_google(): + import pprint + pprint.pprint(search('heroes', 'abercrombie', 'svga', 60, debug=True)) + +def test(): + from Queue import Queue + from threading import Event + from calibre.utils.logging import default_log + p = GoogleImages(None) + rq = Queue() + p.download_cover(default_log, rq, Event(), title='The Heroes', + authors=('Joe Abercrombie',)) + print ('Downloaded', rq.qsize(), 'covers') + +if __name__ == '__main__': + test() + diff --git a/src/calibre/ebooks/metadata/sources/openlibrary.py b/src/calibre/ebooks/metadata/sources/openlibrary.py index 4645d2a18a..b0eeb940a5 100644 --- a/src/calibre/ebooks/metadata/sources/openlibrary.py +++ b/src/calibre/ebooks/metadata/sources/openlibrary.py @@ -19,7 +19,7 @@ class OpenLibrary(Source): OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false' def download_cover(self, log, result_queue, abort, - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if 'isbn' not in identifiers: return isbn = identifiers['isbn'] diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 6d6ebd3990..b232c7c9a4 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -75,7 +75,7 @@ class OverDrive(Source): # }}} def download_cover(self, log, result_queue, abort, # {{{ - title=None, authors=None, identifiers={}, timeout=30): + title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): import mechanize cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index ebb104818f..0f4b0c2c53 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -55,7 +55,7 @@ class Ozon(Source): # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) ozonid = identifiers.get('ozon', None) - + unk = unicode(_('Unknown')).upper() if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid: qItems = set([isbn, title]) @@ -64,19 +64,19 @@ class Ozon(Source): qItems.discard(None) qItems.discard('') qItems = map(_quoteString, qItems) - + q = u' '.join(qItems).strip() log.info(u'search string: ' + q) - + if isinstance(q, unicode): q = q.encode('utf-8') if not q: return None - + search_url += quote_plus(q) else: search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid - + log.debug(u'search url: %r'%search_url) return search_url # }}} @@ -250,7 +250,7 @@ class Ozon(Source): return url # }}} - def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{ + def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{ cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.debug('No cached cover found, running identify') diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py index 48f0f99584..51fb883e7d 100644 --- a/src/calibre/ebooks/metadata/sources/worker.py +++ b/src/calibre/ebooks/metadata/sources/worker.py @@ -11,6 +11,7 @@ import os from threading import Event, Thread from Queue import Queue, Empty from io import BytesIO +from collections import Counter from calibre.utils.date import as_utc from calibre.ebooks.metadata.sources.identify import identify, msprefs @@ -113,13 +114,18 @@ def single_covers(title, authors, identifiers, caches, tdir): kwargs=dict(title=title, authors=authors, identifiers=identifiers)) worker.daemon = True worker.start() + c = Counter() while worker.is_alive(): try: plugin, width, height, fmt, data = results.get(True, 1) except Empty: continue else: - name = '%s,,%s,,%s,,%s.cover'%(plugin.name, width, height, fmt) + name = plugin.name + if plugin.can_get_multiple_covers: + name += '{%d}'%c[plugin.name] + c[plugin.name] += 1 + name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt) with open(name, 'wb') as f: f.write(data) os.mkdir(name+'.done') diff --git a/src/calibre/gui2/metadata/single_download.py b/src/calibre/gui2/metadata/single_download.py index e4a78b674a..ffa83b6ea8 100644 --- a/src/calibre/gui2/metadata/single_download.py +++ b/src/calibre/gui2/metadata/single_download.py @@ -16,13 +16,12 @@ from operator import attrgetter from Queue import Queue, Empty from io import BytesIO -from PyQt4.Qt import (QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt, - QApplication, QDialog, QVBoxLayout, QLabel, - QDialogButtonBox, QStyle, QStackedWidget, QWidget, - QTableView, QGridLayout, QFontInfo, QPalette, QTimer, - pyqtSignal, QAbstractTableModel, QVariant, QSize, - QListView, QPixmap, QAbstractListModel, QColor, QRect, - QTextBrowser, QStringListModel) +from PyQt4.Qt import ( + QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt, QApplication, + QDialog, QVBoxLayout, QLabel, QDialogButtonBox, QStyle, QStackedWidget, + QWidget, QTableView, QGridLayout, QFontInfo, QPalette, QTimer, pyqtSignal, + QAbstractTableModel, QVariant, QSize, QListView, QPixmap, QModelIndex, + QAbstractListModel, QColor, QRect, QTextBrowser, QStringListModel) from PyQt4.QtWebKit import QWebView from calibre.customize.ui import metadata_plugins @@ -654,7 +653,7 @@ class CoversModel(QAbstractListModel): # {{{ for i, plugin in enumerate(metadata_plugins(['cover'])): self.covers.append((plugin.name+'\n'+_('Searching...'), QVariant(self.blank), None, True)) - self.plugin_map[plugin] = i+1 + self.plugin_map[plugin] = [i+1] if do_reset: self.reset() @@ -685,48 +684,82 @@ class CoversModel(QAbstractListModel): # {{{ def plugin_for_index(self, index): row = index.row() if hasattr(index, 'row') else index for k, v in self.plugin_map.iteritems(): - if v == row: + if row in v: return k - def cover_keygen(self, x): - pmap = x[2] - if pmap is None: - return 1 - return pmap.width()*pmap.height() - def clear_failed(self): + # Remove entries that are still waiting good = [] pmap = {} - dcovers = sorted(self.covers[1:], key=self.cover_keygen, reverse=True) - cmap = {x:self.covers.index(x) for x in self.covers} + def keygen(x): + pmap = x[2] + if pmap is None: + return 1 + return pmap.width()*pmap.height() + dcovers = sorted(self.covers[1:], key=keygen, reverse=True) + cmap = {i:self.plugin_for_index(i) for i in xrange(len(self.covers))} for i, x in enumerate(self.covers[0:1] + dcovers): if not x[-1]: good.append(x) - if i > 0: - plugin = self.plugin_for_index(cmap[x]) - pmap[plugin] = len(good) - 1 + plugin = cmap[i] + if plugin is not None: + try: + pmap[plugin].append(len(good) - 1) + except KeyError: + pmap[plugin] = [len(good)-1] self.covers = good self.plugin_map = pmap self.reset() - def index_for_plugin(self, plugin): - idx = self.plugin_map.get(plugin, 0) - return self.index(idx) + def pointer_from_index(self, index): + row = index.row() if hasattr(index, 'row') else index + try: + return self.covers[row][2] + except IndexError: + pass + + def index_from_pointer(self, pointer): + for r, (text, scaled, pmap, waiting) in enumerate(self.covers): + if pointer == pmap: + return self.index(r) + return self.index(0) def update_result(self, plugin_name, width, height, data): - idx = None - for plugin, i in self.plugin_map.iteritems(): - if plugin.name == plugin_name: - idx = i - break - if idx is None: - return - pmap = QPixmap() - pmap.loadFromData(data) - if pmap.isNull(): - return - self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False) - self.dataChanged.emit(self.index(idx), self.index(idx)) + if plugin_name.endswith('}'): + # multi cover plugin + plugin_name = plugin_name.partition('{')[0] + plugin = [plugin for plugin in self.plugin_map if plugin.name == plugin_name] + if not plugin: + return + plugin = plugin[0] + last_row = max(self.plugin_map[plugin]) + pmap = QPixmap() + pmap.loadFromData(data) + if pmap.isNull(): + return + self.beginInsertRows(QModelIndex(), last_row, last_row) + for rows in self.plugin_map.itervalues(): + for i in xrange(len(rows)): + if rows[i] >= last_row: + rows[i] += 1 + self.plugin_map[plugin].insert(-1, last_row) + self.covers.insert(last_row, self.get_item(plugin_name, pmap, waiting=False)) + self.endInsertRows() + else: + # single cover plugin + idx = None + for plugin, rows in self.plugin_map.iteritems(): + if plugin.name == plugin_name: + idx = rows[0] + break + if idx is None: + return + pmap = QPixmap() + pmap.loadFromData(data) + if pmap.isNull(): + return + self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False) + self.dataChanged.emit(self.index(idx), self.index(idx)) def cover_pixmap(self, index): row = index.row() @@ -774,9 +807,12 @@ class CoversView(QListView): # {{{ self.m.reset_covers() def clear_failed(self): - plugin = self.m.plugin_for_index(self.currentIndex()) + pointer = self.m.pointer_from_index(self.currentIndex()) self.m.clear_failed() - self.select(self.m.index_for_plugin(plugin).row()) + if pointer is None: + self.select(0) + else: + self.select(self.m.index_from_pointer(pointer).row()) # }}} @@ -852,10 +888,11 @@ class CoversWidget(QWidget): # {{{ if num < 2: txt = _('Could not find any covers for %s')%self.book.title else: - txt = _('Found %(num)d covers of %(title)s. ' - 'Pick the one you like best.')%dict(num=num-1, + txt = _('Found %(num)d possible covers for %(title)s. ' + 'When the download completes, the covers will be sorted by size.')%dict(num=num-1, title=self.title) self.msg.setText(txt) + self.msg.setWordWrap(True) self.finished.emit()