mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	[fix] engine bing-news: replace http:// with https://
BTW: add bing_news to the pylint process

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									c351993fdc
								
							
						
					
					
						commit
						f41734a543
					
				| @ -1,16 +1,27 @@ | ||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| """ | ||||
|  Bing (News) | ||||
| # lint: pylint | ||||
| """Bing (News) | ||||
| """ | ||||
| 
 | ||||
| from urllib.parse import ( | ||||
|     urlencode, | ||||
|     urlparse, | ||||
|     parse_qsl, | ||||
|     quote, | ||||
| ) | ||||
| from datetime import datetime | ||||
| from dateutil import parser | ||||
| from urllib.parse import urlencode, urlparse, parse_qsl | ||||
| from lxml import etree | ||||
| from lxml.etree import XPath | ||||
| from searx.utils import match_language, eval_xpath_getindex | ||||
| from searx.engines.bing import language_aliases | ||||
| from searx.engines.bing import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import | ||||
| from searx.utils import ( | ||||
|     match_language, | ||||
|     eval_xpath_getindex | ||||
| ) | ||||
| from searx.engines.bing import (  # pylint: disable=unused-import | ||||
|     language_aliases, | ||||
|     _fetch_supported_languages, | ||||
|     supported_languages_url, | ||||
| ) | ||||
| 
 | ||||
| # about | ||||
| about = { | ||||
| @ -31,69 +42,71 @@ time_range_support = True | ||||
# Base URL of the Bing News service; all request paths below are appended to it.
base_url = 'https://www.bing.com/'

# RSS search endpoint without / with a time-interval filter (qft=interval).
search_string = 'news/search?{query}&first={offset}&format=RSS'
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'

# Maps the engine's time_range names to Bing's interval codes.
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
| 
 | ||||
| 
 | ||||
| # remove click | ||||
def url_cleanup(url_string):
    """Strip Bing's click-tracking redirect.

    If *url_string* points at Bing's ``/news/apiclick.aspx`` endpoint, return
    the target taken from its ``url`` query parameter (``None`` when that
    parameter is absent); otherwise return *url_string* unchanged.
    """
    parts = urlparse(url_string)
    if parts.netloc != 'www.bing.com' or parts.path != '/news/apiclick.aspx':
        return url_string
    return dict(parse_qsl(parts.query)).get('url', None)
| 
 | ||||
| 
 | ||||
| # replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=... | ||||
def image_url_cleanup(url_string):
    """Normalize Bing thumbnail URLs.

    Replace ``http://*bing.com/th?id=...`` by
    ``https://www.bing.com/th?id=...``; any other URL is returned unchanged.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        image_id = query.get('id')
        # fix: a /th URL without an ``id`` parameter previously crashed in
        # quote(None) with a TypeError; keep the original URL in that case.
        if image_id is not None:
            url_string = "https://www.bing.com/th?id=" + quote(image_id)
    return url_string
| 
 | ||||
def _get_url(query, language, offset, time_range):
    """Build the full Bing News request URL for *query*.

    *language* is passed through as ``setmkt`` (e.g. ``de-de``); a known
    *time_range* selects the interval-filtered search path.
    """
    # identical query component for both paths
    encoded = urlencode({'q': query, 'setmkt': language})
    if time_range in time_range_dict:
        search_path = search_string_with_time.format(
            query=encoded,
            offset=offset,
            interval=time_range_dict[time_range],
        )
    else:
        # e.g. setmkt=de-de&setlang=de
        search_path = search_string.format(query=encoded, offset=offset)
    return base_url + search_path
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
def request(query, params):
    """Assemble the searx request parameters (engine ``request`` hook)."""
    # an unknown time_range value means this engine cannot serve the request:
    # return params unmodified so no URL is set
    if params['time_range'] and params['time_range'] not in time_range_dict:
        return params

    # Bing paginates 10 results per page, first result is index 1
    offset = (params['pageno'] - 1) * 10 + 1

    language = 'en-US'
    if params['language'] != 'all':
        # supported_languages is injected by the engine loader — see searx.engines
        language = match_language(params['language'], supported_languages, language_aliases)

    params['url'] = _get_url(query, language, offset, params['time_range'])
    return params
| 
 | ||||
| 
 | ||||
| # get response from search-request | ||||
| def response(resp): | ||||
| 
 | ||||
|     results = [] | ||||
| 
 | ||||
|     rss = etree.fromstring(resp.content) | ||||
|     namespaces = rss.nsmap | ||||
| 
 | ||||
|     ns = rss.nsmap | ||||
| 
 | ||||
|     # parse results | ||||
|     for item in rss.xpath('./channel/item'): | ||||
|         # url / title / content | ||||
|         url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None)) | ||||
| @ -110,22 +123,26 @@ def response(resp): | ||||
|             publishedDate = datetime.now() | ||||
| 
 | ||||
|         # thumbnail | ||||
|         thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=ns), 0, default=None) | ||||
|         thumbnail = eval_xpath_getindex( | ||||
|             item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None) | ||||
|         if thumbnail is not None: | ||||
|             thumbnail = image_url_cleanup(thumbnail) | ||||
| 
 | ||||
|         # append result | ||||
|         if thumbnail is not None: | ||||
|             results.append({'url': url, | ||||
|                             'title': title, | ||||
|                             'publishedDate': publishedDate, | ||||
|                             'content': content, | ||||
|                             'img_src': thumbnail}) | ||||
|             results.append({ | ||||
|                 'url': url, | ||||
|                 'title': title, | ||||
|                 'publishedDate': publishedDate, | ||||
|                 'content': content, | ||||
|                 'img_src': thumbnail | ||||
|             }) | ||||
|         else: | ||||
|             results.append({'url': url, | ||||
|                             'title': title, | ||||
|                             'publishedDate': publishedDate, | ||||
|                             'content': content}) | ||||
|             results.append({ | ||||
|                 'url': url, | ||||
|                 'title': title, | ||||
|                 'publishedDate': publishedDate, | ||||
|                 'content': content | ||||
|             }) | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user