Handle change of markup on Google search pages

This commit is contained in:
Kovid Goyal 2018-10-11 19:06:24 +05:30
parent 2529c2104c
commit c477444367
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -17,7 +17,7 @@ from calibre import browser as _browser, prints, random_user_agent
from calibre.utils.monotonic import monotonic from calibre.utils.monotonic import monotonic
from calibre.utils.random_ua import accept_header_for_ua from calibre.utils.random_ua import accept_header_for_ua
current_version = (1, 0, 1) current_version = (1, 0, 2)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
@@ -230,13 +230,13 @@ def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump
ans = [] ans = []
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'): for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
try: try:
a = div.xpath('descendant::h3[@class="r"]/a[@href]')[0] a = div.xpath('descendant::div[@class="r"]/a[@href]')[0]
except IndexError: except IndexError:
log('Ignoring div with no descendant') log('Ignoring div with no descendant')
continue continue
title = tostring(a) title = tostring(a)
try: try:
c = div.xpath('descendant::div[@class="s"]//a[@class="fl"]')[0] c = div.xpath('descendant::div[@role="menu"]//a[@class="fl"]')[0]
except IndexError: except IndexError:
log('Ignoring {!r} as it has no cached page'.format(title)) log('Ignoring {!r} as it has no cached page'.format(title))
continue continue
@@ -248,9 +248,9 @@ def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump
return ans, url return ans, url
def google_develop(): def google_develop(search_terms='1423146786'):
br = browser() br = browser()
for result in google_search('1423146786'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]: for result in google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
if '/dp/' in result.url: if '/dp/' in result.url:
print(result.title) print(result.title)
print(' ', result.url) print(' ', result.url)