Metadata download: Add a plugin to download book covers from a Google image search. Go to Preferences->Metadata download and enable the plugin to use it. Google Image search often finds larger and/or different covers than the other sources; however, it sometimes finds junk. Use at your discretion.

Kovid Goyal 2013-04-01 22:54:09 +05:30
parent 1663619cef
commit 9ba0272b0c
14 changed files with 263 additions and 60 deletions
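The new source is off by default; besides the Preferences->Metadata download route described in the commit message, it can be toggled from code. A minimal sketch, assuming the enable_plugin() and all_metadata_plugins() helpers in calibre.customize.ui behave as their names suggest (only is_disabled() actually appears in the diff below):

from calibre.customize.ui import is_disabled, enable_plugin, all_metadata_plugins

for plugin in all_metadata_plugins():
    if plugin.name == 'Google Images':
        if is_disabled(plugin):    # it ships in default_disabled_plugins, see below
            enable_plugin(plugin)  # assumed helper; persists via the 'enabled_plugins' config key
        break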

View File

@@ -757,8 +757,9 @@ from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
from calibre.ebooks.metadata.sources.google_images import GoogleImages
plugins += [GoogleBooks, Amazon, Edelweiss, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
plugins += [GoogleBooks, Amazon, Edelweiss, GoogleImages, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
# }}}
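Sketch only (not part of the commit): once GoogleImages is registered in the builtin plugins list above, it is discoverable through the same query the cover-download GUI uses later in this commit. My assumption is that metadata_plugins() yields only enabled plugins, and Google Images starts out disabled by default.

from calibre.customize.ui import metadata_plugins

cover_sources = [p.name for p in metadata_plugins(['cover'])]
# 'Google Images' should appear in this list once the plugin has been enabled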

View File

@@ -91,7 +91,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss',
'Overdrive', 'Douban Books', 'OZON.ru', 'Edelweiss', 'Google Images',
])
def is_disabled(plugin):

View File

@@ -858,7 +858,7 @@ class Amazon(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -31,7 +31,7 @@ msprefs.defaults['find_first_edition_date'] = False
# Google covers are often poor quality (scans/errors) but they have high
# resolution, so they trump covers from better sources. So make sure they
# are only used if no other covers are found.
msprefs.defaults['cover_priorities'] = {'Google':2}
msprefs.defaults['cover_priorities'] = {'Google':2, 'Google Images':2}
def create_log(ostream=None):
from calibre.utils.logging import ThreadSafeLog, FileStream
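The comment in the hunk above is the rationale for giving both Google sources priority 2: a priority map, not raw resolution, decides which cover wins. A hedged sketch of how such a map could be applied; the real ranking code lives elsewhere in the identify pipeline and is not part of this hunk, and the default priority of 1 for unlisted sources is an assumption.

def rank_covers(results, priorities):
    # results are (source_name, width, height) tuples; lower priority wins,
    # and resolution only breaks ties within the same priority
    return sorted(results, key=lambda r: (priorities.get(r[0], 1), -(r[1] * r[2])))

print(rank_covers([('Google Images', 1600, 2400), ('Some Source', 600, 800)],
                  {'Google': 2, 'Google Images': 2}))
# The smaller 'Some Source' cover still sorts first, because its priority is 1.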
@@ -222,6 +222,9 @@ class Source(Plugin):
#: plugin
config_help_message = None
#: If True this source can return multiple covers for a given query
can_get_multiple_covers = False
def __init__(self, *args, **kwargs):
Plugin.__init__(self, *args, **kwargs)
@@ -522,7 +525,7 @@
return None
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
'''
Download a cover and put it into result_queue. The parameters all have
the same meaning as for :meth:`identify`. Put (self, cover_data) into
@@ -531,6 +534,9 @@
This method should use cached cover URLs for efficiency whenever
possible. When cached data is not present, most plugins simply call
identify and use its results.
If the parameter get_best_cover is True and this plugin can get
multiple covers, it should only get the "best" one.
'''
pass
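A hedged sketch of the contract described in the docstring above, modelled on the Google Images plugin added later in this commit; get_image_urls() and download_image() are hypothetical stand-ins for whatever a real source uses.

from calibre.ebooks.metadata.sources.base import Source

class MultiCoverSource(Source):
    can_get_multiple_covers = True

    def download_cover(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30, get_best_cover=False):
        urls = self.get_image_urls(title, authors, log, abort, timeout)  # hypothetical helper
        if get_best_cover:
            urls = urls[:1]  # only the "best" candidate when the caller wants a single cover
        for url in urls:
            self.download_image(url, timeout, log, result_queue)  # hypothetical helper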

View File

@@ -35,9 +35,14 @@ class Worker(Thread):
start_time = time.time()
if not self.abort.is_set():
try:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors,
identifiers=self.identifiers, timeout=self.timeout)
if self.plugin.can_get_multiple_covers:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors, get_best_cover=True,
identifiers=self.identifiers, timeout=self.timeout)
else:
self.plugin.download_cover(self.log, self.rq, self.abort,
title=self.title, authors=self.authors,
identifiers=self.identifiers, timeout=self.timeout)
except:
self.log.exception('Failed to download cover from',
self.plugin.name)
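The if/else above exists only to add get_best_cover=True for sources that advertise can_get_multiple_covers. An equivalent formulation, offered purely as a sketch (the commit keeps the explicit branches), using a hypothetical helper:

def dispatch_download_cover(plugin, log, rq, abort, title, authors, identifiers, timeout):
    kwargs = dict(title=title, authors=authors, identifiers=identifiers, timeout=timeout)
    if plugin.can_get_multiple_covers:
        # ask multi-cover sources for just their best image in this context
        kwargs['get_best_cover'] = True
    plugin.download_cover(log, rq, abort, **kwargs)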

View File

@@ -221,7 +221,7 @@ class Douban(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -320,7 +320,7 @@ class Edelweiss(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -209,7 +209,7 @@ class GoogleBooks(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')

View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from collections import OrderedDict

from calibre import as_unicode
from calibre.ebooks.metadata.sources.base import Source, Option

class GoogleImages(Source):

    name = 'Google Images'
    description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Google Image Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the google search result')),
               Option('size', 'choices', 'svga', _('Cover size'),
                      _('Search for covers larger than the specified size'),
                      choices=OrderedDict((
                          ('any', _('Any size'),),
                          ('l', _('Large'),),
                          ('qsvga', _('Larger than %s')%'400x300',),
                          ('vga', _('Larger than %s')%'640x480',),
                          ('svga', _('Larger than %s')%'600x800',),
                          ('xga', _('Larger than %s')%'1024x768',),
                          ('2mp', _('Larger than %s')%'2 MP',),
                          ('4mp', _('Larger than %s')%'4 MP',),
                      ))),
               )

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        from threading import Thread
        import time
        timeout = max(60, timeout) # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        if not urls:
            log('No images found in Google for, title: %r and authors: %r'%(title, author))
            return
        urls = urls[:self.prefs['max_covers']]
        if get_best_cover:
            urls = urls[:1]
        workers = [Thread(target=self.download_image, args=(url, timeout, log, result_queue)) for url in urls]
        for w in workers:
            w.daemon = True
            w.start()
        alive = True
        start_time = time.time()
        while alive and not abort.is_set() and time.time() - start_time < timeout:
            alive = False
            for w in workers:
                if w.is_alive():
                    alive = True
                    break
            abort.wait(0.1)

    def download_image(self, url, timeout, log, result_queue):
        try:
            ans = self.browser.open_novisit(url, timeout=timeout).read()
            result_queue.put((self, ans))
            log('Downloaded cover from: %s'%url)
        except Exception:
            self.log.exception('Failed to download cover from: %r'%url)

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.ipc.simple_worker import fork_job, WorkerError
        try:
            return fork_job('calibre.ebooks.metadata.sources.google_images',
                    'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result']
        except WorkerError as e:
            if e.orig_tb:
                log.error(e.orig_tb)
            log.exception('Searching google failed:' + as_unicode(e))
        except Exception as e:
            log.exception('Searching google failed:' + as_unicode(e))

        return []

USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Firefox/3.6.13'

def find_image_urls(br, ans):
    import urlparse
    for w in br.page.mainFrame().documentElement().findAll('.images_table a[href]'):
        try:
            imgurl = urlparse.parse_qs(urlparse.urlparse(unicode(w.attribute('href'))).query)['imgurl'][0]
        except:
            continue
        if imgurl not in ans:
            ans.append(imgurl)

def search(title, author, size, timeout, debug=False):
    import time
    from calibre.web.jsbrowser.browser import Browser, LoadWatcher, Timeout
    ans = []
    start_time = time.time()
    br = Browser(user_agent=USER_AGENT, enable_developer_tools=debug)
    br.visit('https://www.google.com/advanced_image_search')
    f = br.select_form('form[action="/search"]')
    f['as_q'] = '%s %s'%(title, author)
    if size != 'any':
        f['imgsz'] = size
    f['imgar'] = 't|xt'
    f['as_filetype'] = 'jpg'
    br.submit(wait_for_load=False)

    # Loop until the page finishes loading or at least five image urls are
    # found
    lw = LoadWatcher(br.page, br)
    while lw.is_loading and len(ans) < 5:
        br.run_for_a_time(0.2)
        find_image_urls(br, ans)
        if time.time() - start_time > timeout:
            raise Timeout('Timed out trying to load google image search page')
    find_image_urls(br, ans)
    if debug:
        br.show_browser()
    br.close()
    del br # Needed to prevent PyQt from segfaulting
    return ans

def test_google():
    import pprint
    pprint.pprint(search('heroes', 'abercrombie', 'svga', 60, debug=True))

def test():
    from Queue import Queue
    from threading import Event
    from calibre.utils.logging import default_log
    p = GoogleImages(None)
    rq = Queue()
    p.download_cover(default_log, rq, Event(), title='The Heroes',
            authors=('Joe Abercrombie',))
    print ('Downloaded', rq.qsize(), 'covers')

if __name__ == '__main__':
    test()
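To exercise the source in isolation, the module's own test() above already covers the basic path. The variant below is only a sketch: it additionally reads the plugin options back through prefs and uses the new get_best_cover flag, assuming prefs falls back to the defaults declared on the class.

from __future__ import print_function
from Queue import Queue
from threading import Event
from calibre.utils.logging import default_log
from calibre.ebooks.metadata.sources.google_images import GoogleImages

p = GoogleImages(None)
print(p.prefs['size'], p.prefs['max_covers'])  # expected: 'svga' 5, per the Option defaults above
rq = Queue()
p.download_cover(default_log, rq, Event(), title='The Heroes',
        authors=('Joe Abercrombie',), get_best_cover=True)
print(rq.qsize(), 'cover(s) downloaded')       # at most 1, because get_best_cover=True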

View File

@@ -19,7 +19,7 @@ class OpenLibrary(Source):
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if 'isbn' not in identifiers:
return
isbn = identifiers['isbn']

View File

@@ -75,7 +75,7 @@ class OverDrive(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
import mechanize
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:

View File

@@ -55,7 +55,7 @@ class Ozon(Source):
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
ozonid = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper()
if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
qItems = set([isbn, title])
@@ -64,19 +64,19 @@ class Ozon(Source):
qItems.discard(None)
qItems.discard('')
qItems = map(_quoteString, qItems)
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
search_url += quote_plus(q)
else:
search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
log.debug(u'search url: %r'%search_url)
return search_url
# }}}
@@ -250,7 +250,7 @@ class Ozon(Source):
return url
# }}}
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.debug('No cached cover found, running identify')

View File

@@ -11,6 +11,7 @@ import os
from threading import Event, Thread
from Queue import Queue, Empty
from io import BytesIO
from collections import Counter
from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
@@ -113,13 +114,18 @@ def single_covers(title, authors, identifiers, caches, tdir):
kwargs=dict(title=title, authors=authors, identifiers=identifiers))
worker.daemon = True
worker.start()
c = Counter()
while worker.is_alive():
try:
plugin, width, height, fmt, data = results.get(True, 1)
except Empty:
continue
else:
name = '%s,,%s,,%s,,%s.cover'%(plugin.name, width, height, fmt)
name = plugin.name
if plugin.can_get_multiple_covers:
name += '{%d}'%c[plugin.name]
c[plugin.name] += 1
name = '%s,,%s,,%s,,%s.cover'%(name, width, height, fmt)
with open(name, 'wb') as f:
f.write(data)
os.mkdir(name+'.done')
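Worked example (values invented) of the file names the loop above now produces; the '{n}' suffix added for multi-cover sources is what the GUI's CoversModel.update_result keys on further down in this commit.

from collections import Counter

c = Counter()

def cover_name(plugin_name, can_get_multiple, width, height, fmt):
    # mirrors the naming logic above
    name = plugin_name
    if can_get_multiple:
        name += '{%d}' % c[plugin_name]
        c[plugin_name] += 1
    return '%s,,%s,,%s,,%s.cover' % (name, width, height, fmt)

print(cover_name('Google Images', True, 800, 1200, 'jpg'))  # Google Images{0},,800,,1200,,jpg.cover
print(cover_name('Google Images', True, 600, 900, 'jpg'))   # Google Images{1},,600,,900,,jpg.cover
print(cover_name('Some Source', False, 450, 680, 'jpg'))    # Some Source,,450,,680,,jpg.cover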

View File

@@ -16,13 +16,12 @@ from operator import attrgetter
from Queue import Queue, Empty
from io import BytesIO
from PyQt4.Qt import (QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt,
QApplication, QDialog, QVBoxLayout, QLabel,
QDialogButtonBox, QStyle, QStackedWidget, QWidget,
QTableView, QGridLayout, QFontInfo, QPalette, QTimer,
pyqtSignal, QAbstractTableModel, QVariant, QSize,
QListView, QPixmap, QAbstractListModel, QColor, QRect,
QTextBrowser, QStringListModel)
from PyQt4.Qt import (
QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt, QApplication,
QDialog, QVBoxLayout, QLabel, QDialogButtonBox, QStyle, QStackedWidget,
QWidget, QTableView, QGridLayout, QFontInfo, QPalette, QTimer, pyqtSignal,
QAbstractTableModel, QVariant, QSize, QListView, QPixmap, QModelIndex,
QAbstractListModel, QColor, QRect, QTextBrowser, QStringListModel)
from PyQt4.QtWebKit import QWebView
from calibre.customize.ui import metadata_plugins
@@ -654,7 +653,7 @@ class CoversModel(QAbstractListModel): # {{{
for i, plugin in enumerate(metadata_plugins(['cover'])):
self.covers.append((plugin.name+'\n'+_('Searching...'),
QVariant(self.blank), None, True))
self.plugin_map[plugin] = i+1
self.plugin_map[plugin] = [i+1]
if do_reset:
self.reset()
@@ -685,48 +684,82 @@ class CoversModel(QAbstractListModel): # {{{
def plugin_for_index(self, index):
row = index.row() if hasattr(index, 'row') else index
for k, v in self.plugin_map.iteritems():
if v == row:
if row in v:
return k
def cover_keygen(self, x):
pmap = x[2]
if pmap is None:
return 1
return pmap.width()*pmap.height()
def clear_failed(self):
# Remove entries that are still waiting
good = []
pmap = {}
dcovers = sorted(self.covers[1:], key=self.cover_keygen, reverse=True)
cmap = {x:self.covers.index(x) for x in self.covers}
def keygen(x):
pmap = x[2]
if pmap is None:
return 1
return pmap.width()*pmap.height()
dcovers = sorted(self.covers[1:], key=keygen, reverse=True)
cmap = {i:self.plugin_for_index(i) for i in xrange(len(self.covers))}
for i, x in enumerate(self.covers[0:1] + dcovers):
if not x[-1]:
good.append(x)
if i > 0:
plugin = self.plugin_for_index(cmap[x])
pmap[plugin] = len(good) - 1
plugin = cmap[i]
if plugin is not None:
try:
pmap[plugin].append(len(good) - 1)
except KeyError:
pmap[plugin] = [len(good)-1]
self.covers = good
self.plugin_map = pmap
self.reset()
def index_for_plugin(self, plugin):
idx = self.plugin_map.get(plugin, 0)
return self.index(idx)
def pointer_from_index(self, index):
row = index.row() if hasattr(index, 'row') else index
try:
return self.covers[row][2]
except IndexError:
pass
def index_from_pointer(self, pointer):
for r, (text, scaled, pmap, waiting) in enumerate(self.covers):
if pointer == pmap:
return self.index(r)
return self.index(0)
def update_result(self, plugin_name, width, height, data):
idx = None
for plugin, i in self.plugin_map.iteritems():
if plugin.name == plugin_name:
idx = i
break
if idx is None:
return
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False)
self.dataChanged.emit(self.index(idx), self.index(idx))
if plugin_name.endswith('}'):
# multi cover plugin
plugin_name = plugin_name.partition('{')[0]
plugin = [plugin for plugin in self.plugin_map if plugin.name == plugin_name]
if not plugin:
return
plugin = plugin[0]
last_row = max(self.plugin_map[plugin])
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.beginInsertRows(QModelIndex(), last_row, last_row)
for rows in self.plugin_map.itervalues():
for i in xrange(len(rows)):
if rows[i] >= last_row:
rows[i] += 1
self.plugin_map[plugin].insert(-1, last_row)
self.covers.insert(last_row, self.get_item(plugin_name, pmap, waiting=False))
self.endInsertRows()
else:
# single cover plugin
idx = None
for plugin, rows in self.plugin_map.iteritems():
if plugin.name == plugin_name:
idx = rows[0]
break
if idx is None:
return
pmap = QPixmap()
pmap.loadFromData(data)
if pmap.isNull():
return
self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False)
self.dataChanged.emit(self.index(idx), self.index(idx))
def cover_pixmap(self, index):
row = index.row()
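Illustrative only: the shape plugin_map now takes (a list of row indices per source) and how update_result above tells the two result kinds apart; the dict keys below stand in for real plugin objects.

plugin_map = {'Google Images': [1, 5, 6],  # a multi-cover source can now own several rows
              'Some Source': [2]}          # a single-cover source keeps exactly one row

plugin_name = 'Google Images{2}'           # name reported for the third Google Images cover
if plugin_name.endswith('}'):
    # multi-cover result: a new row is inserted and added to the source's row list
    base = plugin_name.partition('{')[0]   # 'Google Images'
# otherwise the source's single existing row is updated in place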
@@ -774,9 +807,12 @@ class CoversView(QListView): # {{{
self.m.reset_covers()
def clear_failed(self):
plugin = self.m.plugin_for_index(self.currentIndex())
pointer = self.m.pointer_from_index(self.currentIndex())
self.m.clear_failed()
self.select(self.m.index_for_plugin(plugin).row())
if pointer is None:
self.select(0)
else:
self.select(self.m.index_from_pointer(pointer).row())
# }}}
@@ -852,10 +888,11 @@ class CoversWidget(QWidget): # {{{
if num < 2:
txt = _('Could not find any covers for <b>%s</b>')%self.book.title
else:
txt = _('Found <b>%(num)d</b> covers of %(title)s. '
'Pick the one you like best.')%dict(num=num-1,
txt = _('Found <b>%(num)d</b> possible covers for %(title)s. '
'When the download completes, the covers will be sorted by size.')%dict(num=num-1,
title=self.title)
self.msg.setText(txt)
self.msg.setWordWrap(True)
self.finished.emit()