Replace use of obsolete Qt WebKit in the google images metadata source plugin

2025-07-09 03:04:10 -04:00 · 2016-04-24 11:04:25 +05:30 · 2016-04-24 11:04:25 +05:30 · aaf9454891
commit aaf9454891
parent 30b1db98b0
1 changed file with 34 additions and 61 deletions
--- a/src/calibre/ebooks/metadata/sources/google_images.py
+++ b/src/calibre/ebooks/metadata/sources/google_images.py
@@ -9,9 +9,10 @@ __docformat__ = 'restructuredtext en'

 from collections import OrderedDict

-from calibre import as_unicode
 from calibre.ebooks.metadata.sources.base import Source, Option

+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
+
 class GoogleImages(Source):

    name = 'Google Images'
@@ -19,6 +20,7 @@ class GoogleImages(Source):
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Google Image Search plugin')
    can_get_multiple_covers = True
+    supports_gzip_transfer_encoding = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the google search result')),
               Option('size', 'choices', 'svga', _('Cover size'),
@@ -45,66 +47,38 @@ class GoogleImages(Source):
        urls = self.get_image_urls(title, author, log, abort, timeout)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

+    @property
+    def user_agent(self):
+        return USER_AGENT
+
    def get_image_urls(self, title, author, log, abort, timeout):
-        from calibre.utils.ipc.simple_worker import fork_job, WorkerError
-        try:
-            return fork_job('calibre.ebooks.metadata.sources.google_images',
-                    'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result']
-        except WorkerError as e:
-            if e.orig_tb:
-                log.error(e.orig_tb)
-            log.exception('Searching google failed:' + as_unicode(e))
-        except Exception as e:
-            log.exception('Searching google failed:' + as_unicode(e))
-
-        return []
-
-USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
-
-def find_image_urls(br, ans):
-    import urlparse
-    for w in br.page.mainFrame().documentElement().findAll('div#ires a.rg_l[href]'):
-        try:
-            imgurl = urlparse.parse_qs(urlparse.urlparse(unicode(w.attribute('href'))).query)['imgurl'][0]
-        except:
-            # import traceback
-            # traceback.print_exc()
-            continue
-        if imgurl not in ans:
-            ans.append(imgurl)
-
-def search(title, author, size, timeout, debug=False):
-    import time
-    from calibre.web.jsbrowser.browser import Browser, Timeout
-    ans = []
-    start_time = time.time()
-    br = Browser(user_agent=USER_AGENT, enable_developer_tools=debug, headless=not debug)
-    br.visit('https://www.google.com/advanced_image_search')
-    f = br.select_form('form[action="/search"]')
-    f['as_q'] = '%s %s'%(title, author)
-    if size != 'any':
-        f['imgsz'] = size
-    f['imgar'] = 't|xt'
-    f['as_filetype'] = 'jpg'
-    br.submit(wait_for_load=False)
-
-    # Loop until the page finishes loading or at least five image urls are
-    # found
-    while len(ans) < 5:
-        br.run_for_a_time(0.2)
-        find_image_urls(br, ans)
-        if time.time() - start_time > timeout:
-            raise Timeout('Timed out trying to load google image search page')
-    find_image_urls(br, ans)
-    if debug:
-        br.show_browser()
-    br.close()
-    del br  # Needed to prevent PyQt from segfaulting
-    return ans
-
-def test_google():
-    import pprint
-    pprint.pprint(search('heroes', 'abercrombie', 'svga', 60, debug=True))
+        from calibre.utils.cleantext import clean_ascii_chars
+        from urllib import quote_plus
+        import html5lib
+        import json
+        from collections import OrderedDict
+        ans = OrderedDict()
+        br = self.browser
+        q = quote_plus('%s %s'%(title, author))
+        sz = self.prefs['size']
+        if sz == 'any':
+            sz = ''
+        elif sz == 'l':
+            sz = 'isz:l,'
+        else:
+            sz = 'isz:lt,islt:%s,' % sz
+        url = 'https://www.google.com/search?as_st=y&tbm=isch&as_q={}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
+        log('Search URL: ' + url)
+        raw = br.open(url).read()
+        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
+        for div in root.xpath('//div[@class="rg_meta"]'):
+            try:
+                data = json.loads(div.text)
+            except Exception:
+                continue
+            if 'ou' in data:
+                ans[data['ou']] = True
+        return list(ans.iterkeys())

 def test():
    from Queue import Queue
@@ -119,4 +93,3 @@ def test():

 if __name__ == '__main__':
    test()
-