mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Replace use of obsolete Qt WebKit in the google images metadata source plugin
This commit is contained in:
parent
30b1db98b0
commit
aaf9454891
@ -9,9 +9,10 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from calibre import as_unicode
|
|
||||||
from calibre.ebooks.metadata.sources.base import Source, Option
|
from calibre.ebooks.metadata.sources.base import Source, Option
|
||||||
|
|
||||||
|
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
|
||||||
|
|
||||||
class GoogleImages(Source):
|
class GoogleImages(Source):
|
||||||
|
|
||||||
name = 'Google Images'
|
name = 'Google Images'
|
||||||
@ -19,6 +20,7 @@ class GoogleImages(Source):
|
|||||||
capabilities = frozenset(['cover'])
|
capabilities = frozenset(['cover'])
|
||||||
config_help_message = _('Configure the Google Image Search plugin')
|
config_help_message = _('Configure the Google Image Search plugin')
|
||||||
can_get_multiple_covers = True
|
can_get_multiple_covers = True
|
||||||
|
supports_gzip_transfer_encoding = True
|
||||||
options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
|
options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
|
||||||
_('The maximum number of covers to process from the google search result')),
|
_('The maximum number of covers to process from the google search result')),
|
||||||
Option('size', 'choices', 'svga', _('Cover size'),
|
Option('size', 'choices', 'svga', _('Cover size'),
|
||||||
@ -45,66 +47,38 @@ class GoogleImages(Source):
|
|||||||
urls = self.get_image_urls(title, author, log, abort, timeout)
|
urls = self.get_image_urls(title, author, log, abort, timeout)
|
||||||
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
|
self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def user_agent(self):
|
||||||
|
return USER_AGENT
|
||||||
|
|
||||||
def get_image_urls(self, title, author, log, abort, timeout):
|
def get_image_urls(self, title, author, log, abort, timeout):
|
||||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
from urllib import quote_plus
|
||||||
|
import html5lib
|
||||||
|
import json
|
||||||
|
from collections import OrderedDict
|
||||||
|
ans = OrderedDict()
|
||||||
|
br = self.browser
|
||||||
|
q = quote_plus('%s %s'%(title, author))
|
||||||
|
sz = self.prefs['size']
|
||||||
|
if sz == 'any':
|
||||||
|
sz = ''
|
||||||
|
elif sz == 'l':
|
||||||
|
sz = 'isz:l,'
|
||||||
|
else:
|
||||||
|
sz = 'isz:lt,islt:%s,' % sz
|
||||||
|
url = 'https://www.google.com/search?as_st=y&tbm=isch&as_q={}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
|
||||||
|
log('Search URL: ' + url)
|
||||||
|
raw = br.open(url).read()
|
||||||
|
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
|
||||||
|
for div in root.xpath('//div[@class="rg_meta"]'):
|
||||||
try:
|
try:
|
||||||
return fork_job('calibre.ebooks.metadata.sources.google_images',
|
data = json.loads(div.text)
|
||||||
'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result']
|
except Exception:
|
||||||
except WorkerError as e:
|
|
||||||
if e.orig_tb:
|
|
||||||
log.error(e.orig_tb)
|
|
||||||
log.exception('Searching google failed:' + as_unicode(e))
|
|
||||||
except Exception as e:
|
|
||||||
log.exception('Searching google failed:' + as_unicode(e))
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
|
|
||||||
|
|
||||||
def find_image_urls(br, ans):
|
|
||||||
import urlparse
|
|
||||||
for w in br.page.mainFrame().documentElement().findAll('div#ires a.rg_l[href]'):
|
|
||||||
try:
|
|
||||||
imgurl = urlparse.parse_qs(urlparse.urlparse(unicode(w.attribute('href'))).query)['imgurl'][0]
|
|
||||||
except:
|
|
||||||
# import traceback
|
|
||||||
# traceback.print_exc()
|
|
||||||
continue
|
continue
|
||||||
if imgurl not in ans:
|
if 'ou' in data:
|
||||||
ans.append(imgurl)
|
ans[data['ou']] = True
|
||||||
|
return list(ans.iterkeys())
|
||||||
def search(title, author, size, timeout, debug=False):
|
|
||||||
import time
|
|
||||||
from calibre.web.jsbrowser.browser import Browser, Timeout
|
|
||||||
ans = []
|
|
||||||
start_time = time.time()
|
|
||||||
br = Browser(user_agent=USER_AGENT, enable_developer_tools=debug, headless=not debug)
|
|
||||||
br.visit('https://www.google.com/advanced_image_search')
|
|
||||||
f = br.select_form('form[action="/search"]')
|
|
||||||
f['as_q'] = '%s %s'%(title, author)
|
|
||||||
if size != 'any':
|
|
||||||
f['imgsz'] = size
|
|
||||||
f['imgar'] = 't|xt'
|
|
||||||
f['as_filetype'] = 'jpg'
|
|
||||||
br.submit(wait_for_load=False)
|
|
||||||
|
|
||||||
# Loop until the page finishes loading or at least five image urls are
|
|
||||||
# found
|
|
||||||
while len(ans) < 5:
|
|
||||||
br.run_for_a_time(0.2)
|
|
||||||
find_image_urls(br, ans)
|
|
||||||
if time.time() - start_time > timeout:
|
|
||||||
raise Timeout('Timed out trying to load google image search page')
|
|
||||||
find_image_urls(br, ans)
|
|
||||||
if debug:
|
|
||||||
br.show_browser()
|
|
||||||
br.close()
|
|
||||||
del br # Needed to prevent PyQt from segfaulting
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def test_google():
|
|
||||||
import pprint
|
|
||||||
pprint.pprint(search('heroes', 'abercrombie', 'svga', 60, debug=True))
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
from Queue import Queue
|
from Queue import Queue
|
||||||
@ -119,4 +93,3 @@ def test():
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test()
|
test()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user