mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow directly querying the google web cache as well
This commit is contained in:
parent
0f764be26f
commit
7bc0399bda
@ -9,12 +9,13 @@ import re
|
||||
import time
|
||||
from collections import namedtuple
|
||||
from contextlib import contextmanager
|
||||
from threading import Lock
|
||||
|
||||
try:
|
||||
from urllib.parse import parse_qs, quote_plus, unquote, urlencode
|
||||
from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote
|
||||
except ImportError:
|
||||
from urlparse import parse_qs
|
||||
from urllib import quote_plus, urlencode, unquote
|
||||
from urllib import quote_plus, urlencode, unquote, quote
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@ -24,8 +25,10 @@ from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.lock import ExclusiveFile
|
||||
from calibre.utils.random_ua import accept_header_for_ua
|
||||
|
||||
current_version = (1, 0, 18)
|
||||
current_version = (1, 1, 0)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
webcache = {}
|
||||
webcache_lock = Lock()
|
||||
|
||||
|
||||
Result = namedtuple('Result', 'url title cached_url')
|
||||
@ -106,6 +109,11 @@ def quote_term(x):
|
||||
|
||||
# DDG + Wayback machine {{{
|
||||
|
||||
|
||||
def ddg_url_processor(url):
|
||||
return url
|
||||
|
||||
|
||||
def ddg_term(t):
|
||||
t = t.replace('"', '')
|
||||
if t.lower() in {'map', 'news'}:
|
||||
@ -172,7 +180,7 @@ def ddg_develop():
|
||||
if '/dp/' in result.url:
|
||||
print(result.title)
|
||||
print(' ', result.url)
|
||||
print(' ', wayback_machine_cached_url(result.url, br))
|
||||
print(' ', get_cached_url(result.url, br))
|
||||
print()
|
||||
# }}}
|
||||
|
||||
@ -252,6 +260,25 @@ def google_url_processor(url):
|
||||
return url
|
||||
|
||||
|
||||
def google_get_cached_url(url, br=None, log=prints, timeout=60):
|
||||
ourl = url
|
||||
if not isinstance(url, bytes):
|
||||
url = url.encode('utf-8')
|
||||
cu = quote(url, safe='')
|
||||
if isinstance(cu, bytes):
|
||||
cu = cu.decode('utf-8')
|
||||
cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
|
||||
br = google_specialize_browser(br or browser())
|
||||
try:
|
||||
raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
|
||||
except Exception as err:
|
||||
log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
|
||||
else:
|
||||
with webcache_lock:
|
||||
webcache[cached_url] = raw
|
||||
return cached_url
|
||||
|
||||
|
||||
def google_extract_cache_urls(raw):
|
||||
if isinstance(raw, bytes):
|
||||
raw = raw.decode('utf-8', 'replace')
|
||||
@ -351,6 +378,15 @@ def google_develop(search_terms='1423146786', raw_from=''):
|
||||
# }}}
|
||||
|
||||
|
||||
def get_cached_url(url, br=None, log=prints, timeout=60):
|
||||
return google_get_cached_url(url, br, log, timeout) or wayback_machine_cached_url(url, br, log, timeout)
|
||||
|
||||
|
||||
def get_data_for_cached_url(url):
|
||||
with webcache_lock:
|
||||
return webcache.get(url)
|
||||
|
||||
|
||||
def resolve_url(url):
|
||||
prefix, rest = url.partition(':')[::2]
|
||||
if prefix == 'bing':
|
||||
|
Loading…
x
Reference in New Issue
Block a user