From 7bc0399bdae9a0d0ba8d1d88997603570d1b26d1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Jul 2022 11:57:44 +0530
Subject: [PATCH] Allow directly querying the google web cache as well

---
 .../ebooks/metadata/sources/search_engines.py | 44 +++++++++++++++++--
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 41ca1bea2d..5ae3c8aaf4 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -9,12 +9,13 @@ import re
 import time
 from collections import namedtuple
 from contextlib import contextmanager
+from threading import Lock
 
 try:
-    from urllib.parse import parse_qs, quote_plus, unquote, urlencode
+    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote
 except ImportError:
     from urlparse import parse_qs
-    from urllib import quote_plus, urlencode, unquote
+    from urllib import quote_plus, urlencode, unquote, quote
 
 from lxml import etree
 
@@ -24,8 +25,10 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 0, 18)
+current_version = (1, 1, 0)
 minimum_calibre_version = (2, 80, 0)
+webcache = {}
+webcache_lock = Lock()
 
 Result = namedtuple('Result', 'url title cached_url')
 
@@ -106,6 +109,11 @@ def quote_term(x):
 
 
 # DDG + Wayback machine {{{
+
+def ddg_url_processor(url):
+    return url
+
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
@@ -172,7 +180,7 @@ def ddg_develop():
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
-            print(' ', wayback_machine_cached_url(result.url, br))
+            print(' ', get_cached_url(result.url, br))
             print()
 # }}}
 
@@ -252,6 +260,25 @@ def google_url_processor(url):
     return url
 
 
+def google_get_cached_url(url, br=None, log=prints, timeout=60):
+    ourl = url
+    if not isinstance(url, bytes):
+        url = url.encode('utf-8')
+    cu = quote(url, safe='')
+    if isinstance(cu, bytes):
+        cu = cu.decode('utf-8')
+    cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+    br = google_specialize_browser(br or browser())
+    try:
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+    except Exception as err:
+        log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
+    else:
+        with webcache_lock:
+            webcache[cached_url] = raw
+        return cached_url
+
+
 def google_extract_cache_urls(raw):
     if isinstance(raw, bytes):
         raw = raw.decode('utf-8', 'replace')
@@ -351,6 +378,15 @@ def google_develop(search_terms='1423146786', raw_from=''):
 # }}}
 
 
+def get_cached_url(url, br=None, log=prints, timeout=60):
+    return google_get_cached_url(url, br, log, timeout) or wayback_machine_cached_url(url, br, log, timeout)
+
+
+def get_data_for_cached_url(url):
+    with webcache_lock:
+        return webcache.get(url)
+
+
 def resolve_url(url):
     prefix, rest = url.partition(':')[::2]
     if prefix == 'bing':
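
Reviewer note (illustration only, not part of the patch): a minimal sketch of how the new helpers could be used together, assuming a working calibre environment where the module above is importable. The Amazon product URL below is just a hypothetical placeholder.

from calibre import browser
from calibre.ebooks.metadata.sources.search_engines import (
    get_cached_url, get_data_for_cached_url)

br = browser()
# Hypothetical product page whose cached copy we want to read.
url = 'https://www.amazon.com/dp/0000000000'

# Tries the Google web cache first and falls back to the Wayback Machine.
cached = get_cached_url(url, br)
if cached is not None:
    # If the Google cache answered, the response was already stored in the
    # module level webcache dict and can be reused without a second fetch.
    raw = get_data_for_cached_url(cached)
    if raw is None:
        # Wayback Machine URLs are not pre-fetched; download them normally.
        raw = br.open_novisit(cached).read()
    print(cached)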