mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Skip multiple URL hits in google search results
This commit is contained in:
parent
9448c42a64
commit
f9fbd402d8
@ -337,6 +337,7 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
|
|||||||
cache_url_map = google_extract_cache_urls(raw)
|
cache_url_map = google_extract_cache_urls(raw)
|
||||||
# print('\n'.join(cache_url_map))
|
# print('\n'.join(cache_url_map))
|
||||||
ans = []
|
ans = []
|
||||||
|
seen = set()
|
||||||
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
|
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
|
||||||
try:
|
try:
|
||||||
a = div.xpath('descendant::a[@href]')[0]
|
a = div.xpath('descendant::a[@href]')[0]
|
||||||
@ -347,6 +348,9 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
|
|||||||
src_url = a.get('href')
|
src_url = a.get('href')
|
||||||
# print(f'{src_url=}')
|
# print(f'{src_url=}')
|
||||||
curl = canonicalize_url_for_cache_map(src_url)
|
curl = canonicalize_url_for_cache_map(src_url)
|
||||||
|
if curl in seen:
|
||||||
|
continue
|
||||||
|
seen.add(curl)
|
||||||
if curl in cache_url_map:
|
if curl in cache_url_map:
|
||||||
cached_url = cache_url_map[curl]
|
cached_url = cache_url_map[curl]
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user