mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-26 08:12:30 -04:00 
			
		
		
		
	[fix] engine: google has changed the layout of its response
Since 28 March, Google has changed the layout of its response. This patch fixes the google engine so that it scrapes the results & images from the newly designed response. closes: https://github.com/searxng/searxng/issues/2287 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									1498202b0b
								
							
						
					
					
						commit
						6f9e678346
					
				| @ -59,13 +59,10 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} | |||||||
| # specific xpath variables | # specific xpath variables | ||||||
| # ------------------------ | # ------------------------ | ||||||
| 
 | 
 | ||||||
| results_xpath = './/div[@data-sokoban-container]' | results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' | ||||||
| title_xpath = './/a/h3[1]' | title_xpath = './/a/h3[1]' | ||||||
| href_xpath = './/a[h3]/@href' | href_xpath = './/a[h3]/@href' | ||||||
| content_xpath = './/div[@data-content-feature]' | content_xpath = './/div[@data-sncf]' | ||||||
| 
 |  | ||||||
| # google *sections* are no usual *results*, we ignore them |  | ||||||
| g_section_with_header = './g-section-with-header' |  | ||||||
| 
 | 
 | ||||||
| # Suggestions are links placed in a *card-section*, we extract only the text | # Suggestions are links placed in a *card-section*, we extract only the text | ||||||
| # from the links not the links itself. | # from the links not the links itself. | ||||||
| @ -303,21 +300,18 @@ def request(query, params): | |||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # (function(){var s='data:image/jpeg;base64,/9j/4AAQ ... | # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA | ||||||
| # ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})(); | # ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; | ||||||
| RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc") | RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _parse_data_images(dom): | def _parse_data_images(dom): | ||||||
|     data_image_map = {} |     data_image_map = {} | ||||||
|     for _script in eval_xpath_list(dom, "//script[@nonce]"): |     for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()): | ||||||
|         script = _script.text |         end_pos = data_image.rfind('=') | ||||||
|         if not script: |         if end_pos > 0: | ||||||
|             continue |             data_image = data_image[: end_pos + 1] | ||||||
|         script = RE_DATA_IMAGE.search(script) |         data_image_map[img_id] = data_image | ||||||
|         if not script: |  | ||||||
|             continue |  | ||||||
|         data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=') |  | ||||||
|     logger.debug('data:image objects --> %s', list(data_image_map.keys())) |     logger.debug('data:image objects --> %s', list(data_image_map.keys())) | ||||||
|     return data_image_map |     return data_image_map | ||||||
| 
 | 
 | ||||||
| @ -331,10 +325,6 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     # convert the text to dom |     # convert the text to dom | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 |  | ||||||
|     data_image_map = {} |  | ||||||
|     if '_fmt:html' in UI_ASYNC: |  | ||||||
|         # in this format images are embedded by a bse64 encoded 'data:image' |  | ||||||
|     data_image_map = _parse_data_images(dom) |     data_image_map = _parse_data_images(dom) | ||||||
| 
 | 
 | ||||||
|     # results --> answer |     # results --> answer | ||||||
| @ -349,11 +339,6 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks |     for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks | ||||||
| 
 | 
 | ||||||
|         # google *sections* |  | ||||||
|         if extract_text(eval_xpath(result, g_section_with_header)): |  | ||||||
|             logger.debug("ignoring <g-section-with-header>") |  | ||||||
|             continue |  | ||||||
| 
 |  | ||||||
|         try: |         try: | ||||||
|             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) |             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) | ||||||
|             if title_tag is None: |             if title_tag is None: | ||||||
| @ -361,33 +346,29 @@ def response(resp): | |||||||
|                 logger.debug('ignoring item from the result_xpath list: missing title') |                 logger.debug('ignoring item from the result_xpath list: missing title') | ||||||
|                 continue |                 continue | ||||||
|             title = extract_text(title_tag) |             title = extract_text(title_tag) | ||||||
|  | 
 | ||||||
|             url = eval_xpath_getindex(result, href_xpath, 0, None) |             url = eval_xpath_getindex(result, href_xpath, 0, None) | ||||||
|             if url is None: |             if url is None: | ||||||
|  |                 logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) | ||||||
|                 continue |                 continue | ||||||
| 
 | 
 | ||||||
|             content = [] |             content_nodes = eval_xpath(result, content_xpath) | ||||||
|             img_list = [] |             content = extract_text(content_nodes) | ||||||
|             for content_feature in eval_xpath(result, content_xpath): |  | ||||||
|                 val = content_feature.attrib['data-content-feature'] |  | ||||||
|                 if val in ['1', '2']: |  | ||||||
|                     txt = extract_text(content_feature, allow_none=True) |  | ||||||
|                     if txt: |  | ||||||
|                         content.append(txt) |  | ||||||
|                 elif '0' in val: |  | ||||||
|                     img = content_feature.xpath('.//img/@src') |  | ||||||
|                     if img: |  | ||||||
|                         img = img[0] |  | ||||||
|                         if img.startswith('data:image'): |  | ||||||
|                             img_id = content_feature.xpath('.//img/@id') |  | ||||||
|                             if img_id: |  | ||||||
|                                 img = data_image_map.get(img_id[0]) |  | ||||||
|                         img_list.append(img) |  | ||||||
| 
 | 
 | ||||||
|             if not content: |             if not content: | ||||||
|                 logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) |                 logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) | ||||||
|                 continue |                 continue | ||||||
|             content = ' / '.join(content) | 
 | ||||||
|             img_src = img_list[0] if img_list else None |             img_src = content_nodes[0].xpath('.//img/@src') | ||||||
|  |             if img_src: | ||||||
|  |                 img_src = img_src[0] | ||||||
|  |                 if img_src.startswith('data:image'): | ||||||
|  |                     img_id = content_nodes[0].xpath('.//img/@id') | ||||||
|  |                     if img_id: | ||||||
|  |                         img_src = data_image_map.get(img_id[0]) | ||||||
|  |             else: | ||||||
|  |                 img_src = None | ||||||
|  | 
 | ||||||
|             results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) |             results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) | ||||||
| 
 | 
 | ||||||
|         except Exception as e:  # pylint: disable=broad-except |         except Exception as e:  # pylint: disable=broad-except | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user