Skip multiple URL hits in Google search results

This commit is contained in:
Kovid Goyal 2023-12-29 12:12:12 +05:30
parent 9448c42a64
commit f9fbd402d8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -337,6 +337,7 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
cache_url_map = google_extract_cache_urls(raw) cache_url_map = google_extract_cache_urls(raw)
# print('\n'.join(cache_url_map)) # print('\n'.join(cache_url_map))
ans = [] ans = []
seen = set()
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'): for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
try: try:
a = div.xpath('descendant::a[@href]')[0] a = div.xpath('descendant::a[@href]')[0]
@@ -347,6 +348,9 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
src_url = a.get('href') src_url = a.get('href')
# print(f'{src_url=}') # print(f'{src_url=}')
curl = canonicalize_url_for_cache_map(src_url) curl = canonicalize_url_for_cache_map(src_url)
if curl in seen:
continue
seen.add(curl)
if curl in cache_url_map: if curl in cache_url_map:
cached_url = cache_url_map[curl] cached_url = cache_url_map[curl]
else: else: