mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Basic yandex code
Useless since yandex uses CAPTCHAs
This commit is contained in:
parent
3fd1bc2df7
commit
be586a4b69
@ -433,6 +433,57 @@ def google_develop(search_terms='1423146786', raw_from=''):
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
|
# Yandex {{{
|
||||||
|
def yandex_term(t):
    """Normalize a single search term for use in a Yandex query.

    All double-quote characters are removed from the term. If what remains
    is exactly ``OR``, ``AND`` or ``NOT`` it is returned lower-cased; any
    other term is returned with only the quotes stripped.
    """
    unquoted = t.replace('"', '')
    return unquoted.lower() if unquoted in {'OR', 'AND', 'NOT'} else unquoted
|
||||||
|
|
||||||
|
|
||||||
|
def yandex_format_query(terms, site=None):
    """Return the Yandex search URL for the given terms.

    Every term is normalized with yandex_term() and then passed through
    quote_term(). When *site* is not None, a quoted ``site:<site>`` term is
    appended after the search terms. The resulting pieces are joined with
    ``+`` and placed in the ``text`` query parameter.
    """
    encoded = [quote_term(yandex_term(term)) for term in terms]
    if site is not None:
        encoded += [quote_term('site:' + site)]
    query_text = '+'.join(encoded)
    return 'https://yandex.com/search?text={q}'.format(q=query_text)
|
||||||
|
|
||||||
|
|
||||||
|
def yandex_parse_results(root, raw, log=prints, ignore_uncached=True):
    """Extract search results from a Yandex results page.

    Parsing is not implemented: no argument is inspected and no result is
    ever produced. An empty list is returned instead of None so that callers
    which iterate over the results (as yandex_develop() does) see "no
    results" rather than failing with a TypeError.

    :param root: parsed results page (what query() or parse_html() returned)
    :param raw: raw page data the tree was built from
    :param log: logging callable; currently unused
    :param ignore_uncached: currently unused
    :return: an empty list
    """
    return []
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level list that yandex_search() binds as the first argument of
# read_url() (via partial) when building the simple_scraper for query().
yandex_scraper_storage = []
|
||||||
|
|
||||||
|
|
||||||
|
def yandex_search(terms, site=None, br=None, dump_raw=None, log=prints, timeout=60):
    """Run a Yandex search and return ``(results, url)``.

    :param terms: iterable of search terms, formatted by yandex_format_query()
    :param site: optional site to restrict the search to (``site:`` term)
    :param br: browser object to use; a new one is created with browser()
        only when this is None
    :param dump_raw: passed through to query() unchanged
    :param log: logging callable handed to yandex_parse_results()
    :param timeout: timeout passed to query()
    :return: tuple of (value returned by yandex_parse_results(), query URL)
    """
    # Sadly yandex uses CAPTCHAs aggressively
    url = yandex_format_query(terms, site)
    # Honour a caller-supplied browser instead of always replacing it.
    if br is None:
        br = browser()
    raw_pages = []
    from calibre.scraper.simple import read_url
    root = query(
        br, url, 'yandex', dump_raw, timeout=timeout,
        # query() hands the raw page to save_raw; the first saved page is
        # what gets passed on to the parser below.
        save_raw=raw_pages.append,
        simple_scraper=partial(read_url, yandex_scraper_storage),
    )
    return yandex_parse_results(root, raw_pages[0], log=log), url
|
||||||
|
|
||||||
|
|
||||||
|
def yandex_develop(search_terms='1423146786', raw_from=''):
    """Development helper that prints selected Yandex search results.

    When *raw_from* is a non-empty path, that file is read in binary mode and
    parsed with parse_html()/yandex_parse_results(). Otherwise a live search
    for the whitespace-split *search_terms* restricted to ``www.amazon.com``
    is performed, dumping the raw page to ``/t/raw.html``.

    For every result whose URL contains ``/dp/`` the title, URL and cached
    URL are printed, followed by a blank line.
    """
    if not raw_from:
        results = yandex_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html')[0]
    else:
        with open(raw_from, 'rb') as f:
            raw = f.read()
        results = yandex_parse_results(parse_html(raw), raw)
    for hit in (r for r in results if '/dp/' in r.url):
        print(hit.title)
        print(' ', hit.url)
        print(' ', hit.cached_url)
        print()
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
def get_cached_url(url, br=None, log=prints, timeout=60):
|
def get_cached_url(url, br=None, log=prints, timeout=60):
|
||||||
from threading import Lock, Thread
|
from threading import Lock, Thread
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user