mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-25 15:52:31 -04:00 
			
		
		
		
	[feat] startpage: support for news and images
This commit is contained in:
		
							parent
							
								
									feb15e3878
								
							
						
					
					
						commit
						0f2fc5879d
					
				| @ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by | ||||
| 
 | ||||
| .. hint:: | ||||
| 
 | ||||
|    The default category is ``web`` .. and other categories than ``web`` are not | ||||
|    yet implemented. | ||||
|   Supported categories are ``web``, ``news`` and ``images``. | ||||
| 
 | ||||
| """ | ||||
| # pylint: disable=too-many-statements | ||||
| from __future__ import annotations | ||||
| 
 | ||||
| from typing import TYPE_CHECKING | ||||
| from typing import TYPE_CHECKING, Any | ||||
| from collections import OrderedDict | ||||
| import re | ||||
| from unicodedata import normalize, combining | ||||
| from time import time | ||||
| from datetime import datetime, timedelta | ||||
| from json import loads | ||||
| 
 | ||||
| import dateutil.parser | ||||
| import lxml.html | ||||
| import babel.localedata | ||||
| 
 | ||||
| from searx.utils import extract_text, eval_xpath, gen_useragent | ||||
| from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str | ||||
| from searx.network import get  # see https://github.com/searxng/searxng/issues/762 | ||||
| from searx.exceptions import SearxEngineCaptchaException | ||||
| from searx.locales import region_tag | ||||
| @ -250,22 +251,13 @@ def request(query, params): | ||||
|     Additionally the arguments from Startpage's search form need to be set in | ||||
|     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`. | ||||
|     """ | ||||
|     if startpage_categ == 'web': | ||||
|         return _request_cat_web(query, params) | ||||
| 
 | ||||
|     logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def _request_cat_web(query, params): | ||||
| 
 | ||||
|     engine_region = traits.get_region(params['searxng_locale'], 'en-US') | ||||
|     engine_language = traits.get_language(params['searxng_locale'], 'en') | ||||
| 
 | ||||
|     # build arguments | ||||
|     args = { | ||||
|         'query': query, | ||||
|         'cat': 'web', | ||||
|         'cat': startpage_categ, | ||||
|         't': 'device', | ||||
|         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers, | ||||
|         'with_date': time_range_dict.get(params['time_range'], ''), | ||||
| @ -317,40 +309,7 @@ def _request_cat_web(query, params): | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| # get response from search-request | ||||
| def response(resp): | ||||
|     dom = lxml.html.fromstring(resp.text) | ||||
| 
 | ||||
|     if startpage_categ == 'web': | ||||
|         return _response_cat_web(dom) | ||||
| 
 | ||||
|     logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) | ||||
|     return [] | ||||
| 
 | ||||
| 
 | ||||
| def _response_cat_web(dom): | ||||
|     results = [] | ||||
| 
 | ||||
|     # parse results | ||||
|     for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'): | ||||
|         links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]') | ||||
|         if not links: | ||||
|             continue | ||||
|         link = links[0] | ||||
|         url = link.attrib.get('href') | ||||
| 
 | ||||
|         # block google-ad url's | ||||
|         if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): | ||||
|             continue | ||||
| 
 | ||||
|         # block startpage search url's | ||||
|         if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): | ||||
|             continue | ||||
| 
 | ||||
|         title = extract_text(eval_xpath(link, 'h2')) | ||||
|         content = eval_xpath(result, './/p[contains(@class, "description")]') | ||||
|         content = extract_text(content, allow_none=True) or '' | ||||
| 
 | ||||
| def _parse_published_date(content: str) -> tuple[str, datetime | None]: | ||||
|     published_date = None | ||||
| 
 | ||||
|     # check if search result starts with something like: "2 Sep 2014 ... " | ||||
| @ -376,14 +335,92 @@ def _response_cat_web(dom): | ||||
|         # fix content string | ||||
|         content = content[date_pos:] | ||||
| 
 | ||||
|         if published_date: | ||||
|             # append result | ||||
|             results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date}) | ||||
|         else: | ||||
|             # append result | ||||
|             results.append({'url': url, 'title': title, 'content': content}) | ||||
|     return content, published_date | ||||
| 
 | ||||
| 
 | ||||
def _get_web_result(result: dict[str, Any]) -> dict[str, Any]:
    """Build a result dict from one item of Startpage's web results.

    :param result: one result item; the keys ``clickUrl`` and ``title`` are
        required, ``description`` is optional.
    :returns: dict with the keys ``url``, ``title``, ``content`` and
        ``publishedDate``; the last two are whatever
        :py:obj:`_parse_published_date` returns for the description text.
    :raises KeyError: if ``clickUrl`` or ``title`` is missing.
    """
    # A missing (or null) description is normalised to '' so that the helpers
    # below, which are typed for ``str``, are never handed ``None``.
    description = html_to_text(result.get('description') or '')
    content, published_date = _parse_published_date(description)

    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': content,
        'publishedDate': published_date,
    }
| 
 | ||||
| 
 | ||||
def _get_news_result(result: dict[str, Any]) -> dict[str, Any]:
    """Build a result dict from one item of Startpage's news results.

    Title and description are converted from HTML to text and stripped of
    private-use characters.

    :param result: one result item; the keys ``clickUrl`` and ``title`` are
        required, ``description``, ``date`` and ``thumbnailUrl`` are optional.
    :returns: dict with the keys ``url``, ``title``, ``content``,
        ``publishedDate`` and ``thumbnail``; the last two are ``None`` when the
        item carries no (truthy) ``date`` / ``thumbnailUrl``.
    :raises KeyError: if ``clickUrl`` or ``title`` is missing.
    """
    title = remove_pua_from_str(html_to_text(result['title']))
    # A missing (or null) description is normalised to '' so that the string
    # helpers are never handed ``None``.
    content = remove_pua_from_str(html_to_text(result.get('description') or ''))

    published_date = None
    if result.get('date'):
        # The value is divided by 1000 before conversion -- presumably it is an
        # epoch timestamp in milliseconds (TODO confirm against the API).
        published_date = datetime.fromtimestamp(result['date'] / 1000)

    thumbnail_url = None
    if result.get('thumbnailUrl'):
        # Appended to base_url -- presumably a path relative to it.
        thumbnail_url = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': published_date,
        'thumbnail': thumbnail_url,
    }
| 
 | ||||
| 
 | ||||
def _get_image_result(result) -> dict[str, Any] | None:
    """Build an ``images.html`` result dict from one item of Startpage's image
    results.

    :param result: one result item; ``title`` is required, all other keys
        (``altClickUrl``, ``thumbnailUrl``, ``width``, ``height``,
        ``filesize``, ``rawImageUrl``, ``format``) are optional.
    :returns: the result dict, or ``None`` when the item has no (truthy)
        ``altClickUrl``.  ``thumbnail_src``, ``resolution`` and ``filesize``
        are ``None`` when the corresponding source values are absent.
    :raises KeyError: if ``title`` is missing.
    """
    url = result.get('altClickUrl')
    if not url:
        return None

    thumbnail_url = None
    if result.get('thumbnailUrl'):
        # Appended to base_url -- presumably a path relative to it.
        thumbnail_url = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

    filesize = None
    if result.get('filesize'):
        # Only the digits of the value are kept.  A value without any digit
        # yields an empty string, and int('') would raise ValueError -- in that
        # case the file size is simply left unset.
        digits = ''.join(filter(str.isdigit, result['filesize']))
        if digits:
            filesize = humanize_bytes(int(digits))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnail_url,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }
| 
 | ||||
| 
 | ||||
def response(resp) -> list[dict[str, Any]]:
    """Parse Startpage's HTML response into a list of result dicts.

    The results are not read from the DOM: the argument object of the
    ``React.createElement(UIStartpage.AppSerp<Categ>, {...})`` call embedded in
    the page is cut out of ``resp.text``, parsed as JSON, and the result groups
    found under ``render.presenter.regions.mainline`` are converted item by
    item.

    :param resp: HTTP response of the search request; only ``resp.text`` is
        read.
    :returns: list of result dicts; groups whose ``display_type`` is neither
        ``web-google``, ``news-bing`` nor contains ``images`` are ignored.
    :raises json.JSONDecodeError: if the cut-out text is not valid JSON (for
        example when the expected ``React.createElement`` call is not in the
        page).
    """
    categ = startpage_categ.capitalize()
    # The text between the two markers is re-wrapped with '{' and '}}' to form
    # a complete JSON object again.
    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    results_json = loads(results_raw)
    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for results_categ in results_obj.get('mainline', []):
        # The display type is a property of the group, not of the single item:
        # read it once, and tolerate a missing or null value instead of
        # failing with KeyError / TypeError.
        display_type = results_categ.get('display_type') or ''

        for item in results_categ.get('results', []):
            if display_type == 'web-google':
                results.append(_get_web_result(item))
            elif display_type == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in display_type:
                # items without a usable URL are dropped
                image_result = _get_image_result(item)
                if image_result:
                    results.append(image_result)

    return results
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -1792,6 +1792,20 @@ engines: | ||||
|     additional_tests: | ||||
|       rosebud: *test_rosebud | ||||
| 
 | ||||
|   - name: startpage news | ||||
|     engine: startpage | ||||
|     startpage_categ: news | ||||
|     shortcut: spn | ||||
|     timeout: 6.0 | ||||
|     disabled: true | ||||
| 
 | ||||
|   - name: startpage images | ||||
|     engine: startpage | ||||
|     startpage_categ: images | ||||
|     shortcut: spi | ||||
|     timeout: 6.0 | ||||
|     disabled: true | ||||
| 
 | ||||
|   - name: tokyotoshokan | ||||
|     engine: tokyotoshokan | ||||
|     shortcut: tt | ||||
|  | ||||
| @ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str: | ||||
|     return string | ||||
| 
 | ||||
| 
 | ||||
def remove_pua_from_str(string: str) -> str:
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    :param string: the text to clean.
    :returns: ``string`` without the characters whose code point lies in one
        of the three private-use ranges; all other characters are kept in
        their original order.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """
    # Inclusive code point ranges: the private use area of the BMP and the two
    # supplementary private use planes.
    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
    return "".join(c for c in string if not any(lo <= ord(c) <= hi for (lo, hi) in pua_ranges))
| 
 | ||||
| 
 | ||||
| def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]: | ||||
|     rep = {re.escape(k): v for k, v in replaces.items()} | ||||
|     pattern = re.compile("|".join(rep.keys())) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user