mirror of https://github.com/searxng/searxng.git

	[mod] move extract_text, extract_url to searx.utils
This commit is contained in:
parent ecb9f28869
commit 2006eb4680

@@ -1,7 +1,6 @@
 from urllib.parse import quote, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size


 url = 'https://1337x.to/'

@@ -11,8 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

@@ -11,7 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -13,7 +13,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -17,8 +17,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from searx import logger, utils
-from searx.engines.xpath import extract_text
-from searx.utils import match_language, gen_useragent, eval_xpath
+from searx.utils import extract_text, match_language, gen_useragent, eval_xpath

 logger = logger.getChild('bing engine')


@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -15,7 +15,7 @@
 from lxml import html
 import re
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -12,8 +12,7 @@

 from urllib.parse import urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size


 categories = ['videos', 'music', 'files']

@@ -11,8 +11,7 @@

 from urllib.parse import urlencode
 from lxml.html import fromstring
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath

 # engine dependent config
 categories = ['general']  # TODO , 'images', 'music', 'videos', 'files'

@@ -16,9 +16,8 @@
 from lxml.html import fromstring
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.poolrequests import get
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath

 # engine dependent config
 categories = ['general']

@@ -13,9 +13,8 @@ import json
 from urllib.parse import urlencode
 from lxml import html
 from re import compile
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language

 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

@@ -15,12 +15,12 @@

 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import (
     _fetch_supported_languages, supported_languages_url,
     get_region_code, language_aliases
 )
 from searx.poolrequests import get
+from searx.utils import extract_text

 # engine dependent config
 categories = ['images']

@@ -11,8 +11,7 @@
 from lxml import html, etree
 import re
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 from searx import logger

 categories = ['general']

@@ -11,8 +11,7 @@

 from lxml import html
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath

 categories = ['general']
 paging = False

@@ -11,7 +11,7 @@

 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['files']

@@ -13,7 +13,7 @@
 from html import escape
 from urllib.parse import urljoin, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -13,7 +13,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -21,9 +21,8 @@ Definitions`_.
 from urllib.parse import urlencode, urlparse
 from lxml import html
 from flask_babel import gettext
-from searx.engines.xpath import extract_text
 from searx import logger
-from searx.utils import match_language, eval_xpath
+from searx.utils import match_language, extract_text, eval_xpath

 logger = logger.getChild('google engine')


@@ -28,8 +28,7 @@ from urllib.parse import urlencode, urlparse, unquote
 from lxml import html
 from flask_babel import gettext
 from searx import logger
-from searx.utils import eval_xpath
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text, eval_xpath

 # pylint: disable=unused-import
 from searx.engines.google import (

@@ -14,7 +14,7 @@ from datetime import date, timedelta
 from json import loads
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 import re

 # engine dependent config

@@ -16,7 +16,7 @@ from urllib.parse import urlencode
 from lxml import html
 from dateutil import parser
 from html.parser import HTMLParser
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 # engine dependent config

@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, convert_str_to_int
+from searx.utils import extract_text, get_torrent_size, convert_str_to_int

 # engine dependent config
 categories = ['videos', 'music', 'files']

@@ -11,8 +11,7 @@

 from lxml import html
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']

@@ -13,8 +13,7 @@ from datetime import datetime
 from operator import itemgetter

 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ["videos", "music", "files"]

@@ -12,7 +12,7 @@ from lxml import html
 from json import loads
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text


 url = 'https://seedpeer.me/'

@@ -12,7 +12,7 @@

 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['it']

@@ -17,9 +17,8 @@ import re
 from unicodedata import normalize, combining
 from babel import Locale
 from babel.localedata import locale_identifiers
-from searx.engines.xpath import extract_text
 from searx.languages import language_codes
-from searx.utils import eval_xpath, match_language
+from searx.utils import extract_text, eval_xpath, match_language

 # engine dependent config
 categories = ['general']

@@ -13,9 +13,8 @@
 import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
 from datetime import datetime
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero

 # engine dependent config
 categories = ['files', 'videos', 'music']

@@ -15,8 +15,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size

 # engine dependent config
 categories = ['files', 'videos', 'music']

@@ -15,7 +15,7 @@
 from urllib.parse import urlencode, urljoin
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['social media']

@@ -13,9 +13,8 @@

 from searx import logger
 from searx.poolrequests import get
-from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath

 from urllib.parse import urlencode
 from json import loads

@@ -12,7 +12,7 @@

 from lxml import html
 from urllib.parse import urlencode, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text

 # engine dependent config
 categories = ['images']

@@ -1,7 +1,6 @@
-from urllib.parse import unquote, urlencode, urljoin, urlparse
+from urllib.parse import urlencode
 from lxml import html
-from lxml.etree import _ElementStringResult, _ElementUnicodeResult
-from searx.utils import html_to_text, eval_xpath
+from searx.utils import extract_text, extract_url, eval_xpath

 search_url = None
 url_xpath = None

@@ -21,76 +20,6 @@ page_size = 1
 first_page_num = 1


-'''
-if xpath_results is list, extract the text from each result and concat the list
-if xpath_results is a xml element, extract all the text node from it
-   ( text_content() method from lxml )
-if xpath_results is a string element, then it's already done
-'''
-
-
-def extract_text(xpath_results):
-    if type(xpath_results) == list:
-        # it's list of result : concat everything using recursive call
-        result = ''
-        for e in xpath_results:
-            result = result + extract_text(e)
-        return result.strip()
-    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
-        # it's a string
-        return ''.join(xpath_results)
-    else:
-        # it's a element
-        text = html.tostring(
-            xpath_results, encoding='unicode', method='text', with_tail=False
-        )
-        text = text.strip().replace('\n', ' ')
-        return ' '.join(text.split())
-
-
-def extract_url(xpath_results, search_url):
-    if xpath_results == []:
-        raise Exception('Empty url resultset')
-    url = extract_text(xpath_results)
-
-    if url.startswith('//'):
-        # add http or https to this kind of url //example.com/
-        parsed_search_url = urlparse(search_url)
-        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
-    elif url.startswith('/'):
-        # fix relative url to the search engine
-        url = urljoin(search_url, url)
-
-    # fix relative urls that fall through the crack
-    if '://' not in url:
-        url = urljoin(search_url, url)
-
-    # normalize url
-    url = normalize_url(url)
-
-    return url
-
-
-def normalize_url(url):
-    parsed_url = urlparse(url)
-
-    # add a / at this end of the url if there is no path
-    if not parsed_url.netloc:
-        raise Exception('Cannot parse url')
-    if not parsed_url.path:
-        url += '/'
-
-    # FIXME : hack for yahoo
-    if parsed_url.hostname == 'search.yahoo.com'\
-       and parsed_url.path.startswith('/r'):
-        p = parsed_url.path
-        mark = p.find('/**')
-        if mark != -1:
-            return unquote(p[mark + 3:]).decode()
-
-    return url
-
-
 def request(query, params):
     query = urlencode({'q': query})[2:]


@@ -13,8 +13,7 @@

 from urllib.parse import unquote, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, extract_url, match_language, eval_xpath

 # engine dependent config
 categories = ['general']

@@ -13,12 +13,11 @@ import re
 from datetime import datetime, timedelta
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import (
     parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
 )
 from dateutil import parser
-from searx.utils import match_language
+from searx.utils import extract_text, extract_url, match_language

 # engine dependent config
 categories = ['news']

@@ -12,8 +12,7 @@ from lxml import html
 from operator import itemgetter
 from datetime import datetime
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 from searx.poolrequests import get as http_get

 # engine dependent config

@@ -11,8 +11,7 @@
 from functools import reduce
 from json import loads
 from urllib.parse import quote_plus
-from searx.engines.xpath import extract_text
-from searx.utils import list_get
+from searx.utils import extract_text, list_get

 # engine dependent config
 categories = ['videos', 'music']

@@ -10,9 +10,13 @@ from os.path import splitext, join
 from io import open
 from random import choice
 from html.parser import HTMLParser
-from lxml.etree import XPath
+from urllib.parse import urljoin, urlparse, unquote
+
+from lxml import html
+from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult
 from babel.core import get_global
+

 from searx import settings
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
@@ -106,6 +110,74 @@ def html_to_text(html):
     return s.get_text()


+def extract_text(xpath_results):
+    '''
+    if xpath_results is list, extract the text from each result and concat the list
+    if xpath_results is a xml element, extract all the text node from it
+    ( text_content() method from lxml )
+    if xpath_results is a string element, then it's already done
+    '''
+    if type(xpath_results) == list:
+        # it's list of result : concat everything using recursive call
+        result = ''
+        for e in xpath_results:
+            result = result + extract_text(e)
+        return result.strip()
+    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
+        # it's a string
+        return ''.join(xpath_results)
+    else:
+        # it's a element
+        text = html.tostring(
+            xpath_results, encoding='unicode', method='text', with_tail=False
+        )
+        text = text.strip().replace('\n', ' ')
+        return ' '.join(text.split())
+
+
+def extract_url(xpath_results, search_url):
+    if xpath_results == []:
+        raise Exception('Empty url resultset')
+    url = extract_text(xpath_results)
+
+    if url.startswith('//'):
+        # add http or https to this kind of url //example.com/
+        parsed_search_url = urlparse(search_url)
+        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
+    elif url.startswith('/'):
+        # fix relative url to the search engine
+        url = urljoin(search_url, url)
+
+    # fix relative urls that fall through the crack
+    if '://' not in url:
+        url = urljoin(search_url, url)
+
+    # normalize url
+    url = normalize_url(url)
+
+    return url
+
+
+def normalize_url(url):
+    parsed_url = urlparse(url)
+
+    # add a / at this end of the url if there is no path
+    if not parsed_url.netloc:
+        raise Exception('Cannot parse url')
+    if not parsed_url.path:
+        url += '/'
+
+    # FIXME : hack for yahoo
+    if parsed_url.hostname == 'search.yahoo.com'\
+       and parsed_url.path.startswith('/r'):
+        p = parsed_url.path
+        mark = p.find('/**')
+        if mark != -1:
+            return unquote(p[mark + 3:]).decode()
+
+    return url
+
+
 def dict_subset(d, properties):
     result = {}
     for k in properties:

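A minimal usage sketch of the relocated helpers (the example values are hypothetical, not part of the commit): `extract_text` flattens element or attribute results to a string, and `extract_url` resolves the extracted value against the search URL before normalizing it. Note that `unquote(p[mark + 3:]).decode()` in the Yahoo branch of `normalize_url` is a Python 2 leftover; `str` has no `.decode()` in Python 3, so that path would raise if ever hit.

    from lxml import html
    from searx.utils import extract_text, extract_url

    dom = html.fromstring('<a href="//example.com">Example link</a>')

    extract_text(dom.xpath('//a'))        # -> 'Example link'
    extract_text(dom.xpath('//a/@href'))  # -> '//example.com'

    # a protocol-relative href inherits the scheme of the search URL,
    # and normalize_url() adds the missing trailing slash
    extract_url(dom.xpath('//a/@href'), 'https://searx.example/')  # -> 'https://example.com/'
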
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
 from searx.testing import SearxTestCase
 from searx import utils

@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
         self.assertTrue(utils.searx_useragent().startswith('searx'))

     def test_html_to_text(self):
-        html = """
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertIsNotNone(utils.html_to_text(html_str))
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+    def test_extract_text(self):
+        html_str = """
         <a href="/testlink" class="link_access_account">
             <span class="toto">
                 <span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
             </span>
         </a>
         """
-        self.assertIsInstance(utils.html_to_text(html), str)
-        self.assertIsNotNone(utils.html_to_text(html))
-        self.assertEqual(utils.html_to_text(html), "Test text")
+        dom = html.fromstring(html_str)
+        self.assertEqual(utils.extract_text(dom), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+    def test_extract_url(self):
+        def f(html_str, search_url):
+            return utils.extract_url(html.fromstring(html_str), search_url)
+        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+        with self.assertRaises(lxml.etree.ParserError):
+            f('', 'https://example.com')
+        with self.assertRaises(Exception):
+            utils.extract_url([], 'https://example.com')

     def test_html_to_text_invalid(self):
         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'

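The new `test_extract_url` cases double as documentation of the resolution rules. A short sketch of the two failure modes they pin down, assuming the implementations above:

    import lxml.etree
    from lxml import html
    from searx import utils

    # an empty XPath result list is rejected by extract_url itself
    try:
        utils.extract_url([], 'https://example.com')
    except Exception as exc:
        print(exc)  # Empty url resultset

    # an empty document never reaches extract_url: lxml already refuses to parse it
    try:
        html.fromstring('')
    except lxml.etree.ParserError as exc:
        print(exc)  # e.g. "Document is empty"
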