From 7bc0399bdae9a0d0ba8d1d88997603570d1b26d1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 31 Jul 2022 11:57:44 +0530
Subject: [PATCH] Allow directly querying the google web cache as well

---
 .../ebooks/metadata/sources/search_engines.py | 44 +++++++++++++++++--
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 41ca1bea2d..5ae3c8aaf4 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -9,12 +9,13 @@ import re
 import time
 from collections import namedtuple
 from contextlib import contextmanager
+from threading import Lock
 
 try:
-    from urllib.parse import parse_qs, quote_plus, unquote, urlencode
+    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote
 except ImportError:
     from urlparse import parse_qs
-    from urllib import quote_plus, urlencode, unquote
+    from urllib import quote_plus, urlencode, unquote, quote
 
 from lxml import etree
 
@@ -24,8 +25,10 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 0, 18)
+current_version = (1, 1, 0)
 minimum_calibre_version = (2, 80, 0)
+webcache = {}
+webcache_lock = Lock()
 
 Result = namedtuple('Result', 'url title cached_url')
 
@@ -106,6 +109,11 @@ def quote_term(x):
 
 
 # DDG + Wayback machine {{{
+
+def ddg_url_processor(url):
+    return url
+
+
 def ddg_term(t):
     t = t.replace('"', '')
     if t.lower() in {'map', 'news'}:
@@ -172,7 +180,7 @@ def ddg_develop():
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
-            print(' ', wayback_machine_cached_url(result.url, br))
+            print(' ', get_cached_url(result.url, br))
             print()
 # }}}
 
@@ -252,6 +260,25 @@ def google_url_processor(url):
     return url
 
 
+def google_get_cached_url(url, br=None, log=prints, timeout=60):
+    ourl = url
+    if not isinstance(url, bytes):
+        url = url.encode('utf-8')
+    cu = quote(url, safe='')
+    if isinstance(cu, bytes):
+        cu = cu.decode('utf-8')
+    cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+    br = google_specialize_browser(br or browser())
+    try:
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+    except Exception as err:
+        log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
+    else:
+        with webcache_lock:
+            webcache[cached_url] = raw
+        return cached_url
+
+
 def google_extract_cache_urls(raw):
     if isinstance(raw, bytes):
         raw = raw.decode('utf-8', 'replace')
@@ -351,6 +378,15 @@ def google_develop(search_terms='1423146786', raw_from=''):
 # }}}
 
 
+def get_cached_url(url, br=None, log=prints, timeout=60):
+    return google_get_cached_url(url, br, log, timeout) or wayback_machine_cached_url(url, br, log, timeout)
+
+
+def get_data_for_cached_url(url):
+    with webcache_lock:
+        return webcache.get(url)
+
+
 def resolve_url(url):
     prefix, rest = url.partition(':')[::2]
     if prefix == 'bing':
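
Reviewer note (illustration only, not part of the patch): a minimal sketch of how the new helpers could be used together, assuming a working calibre environment where the module above is importable. The Amazon product URL below is just a hypothetical placeholder.

from calibre import browser
from calibre.ebooks.metadata.sources.search_engines import (
    get_cached_url, get_data_for_cached_url)

br = browser()
# Hypothetical product page whose cached copy we want to read.
url = 'https://www.amazon.com/dp/0000000000'

# Tries the Google web cache first and falls back to the Wayback Machine.
cached = get_cached_url(url, br)
if cached is not None:
    # If the Google cache answered, the response was already stored in the
    # module level webcache dict and can be reused without a second fetch.
    raw = get_data_for_cached_url(cached)
    if raw is None:
        # Wayback Machine URLs are not pre-fetched; download them normally.
        raw = br.open_novisit(cached).read()
    print(cached)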