	[mod] move extract_text, extract_url to searx.utils
parent ecb9f28869
commit 2006eb4680
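This commit moves extract_text and extract_url out of the xpath engine module (searx.engines.xpath) into searx.utils, updates every engine that imported them, and adds unit tests for both helpers. For the engine modules the change is mechanical; a minimal before/after sketch of the import consolidation that the hunks below repeat (get_torrent_size stands in for whichever other helpers an engine already pulls from searx.utils):

    # before this commit: the text helper lived in the xpath engine module
    from searx.engines.xpath import extract_text
    from searx.utils import get_torrent_size

    # after this commit: one consolidated import from searx.utils
    from searx.utils import extract_text, get_torrent_size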
@@ -1,7 +1,6 @@
 from urllib.parse import quote, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 
 url = 'https://1337x.to/'
@@ -11,8 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']
@@ -11,7 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config
@@ -13,7 +13,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']
@@ -17,8 +17,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from searx import logger, utils
-from searx.engines.xpath import extract_text
-from searx.utils import match_language, gen_useragent, eval_xpath
+from searx.utils import extract_text, match_language, gen_useragent, eval_xpath
 
 logger = logger.getChild('bing engine')
 
@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ['videos', 'music', 'files']
@@ -15,7 +15,7 @@
 from lxml import html
 import re
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config
@@ -12,8 +12,7 @@
 
 from urllib.parse import urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 
 categories = ['videos', 'music', 'files']
@@ -11,8 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml.html import fromstring
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 
 # engine dependent config
 categories = ['general']  # TODO , 'images', 'music', 'videos', 'files'
@@ -16,9 +16,8 @@
 from lxml.html import fromstring
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.poolrequests import get
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath
 
 # engine dependent config
 categories = ['general']
@@ -13,9 +13,8 @@ import json
 from urllib.parse import urlencode
 from lxml import html
 from re import compile
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language
 
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@@ -15,12 +15,12 @@
 
 from json import loads
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
 from searx.engines.duckduckgo import (
     _fetch_supported_languages, supported_languages_url,
     get_region_code, language_aliases
 )
 from searx.poolrequests import get
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['images']
@@ -11,8 +11,7 @@
 from lxml import html, etree
 import re
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 from searx import logger
 
 categories = ['general']
@@ -11,8 +11,7 @@
 
 from lxml import html
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import eval_xpath
+from searx.utils import extract_text, eval_xpath
 
 categories = ['general']
 paging = False
@@ -11,7 +11,7 @@
 
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['files']
@@ -13,7 +13,7 @@
 from html import escape
 from urllib.parse import urljoin, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']
@@ -13,7 +13,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']
@@ -21,9 +21,8 @@ Definitions`_.
 from urllib.parse import urlencode, urlparse
 from lxml import html
 from flask_babel import gettext
-from searx.engines.xpath import extract_text
 from searx import logger
-from searx.utils import match_language, eval_xpath
+from searx.utils import match_language, extract_text, eval_xpath
 
 logger = logger.getChild('google engine')
 
@@ -28,8 +28,7 @@ from urllib.parse import urlencode, urlparse, unquote
 from lxml import html
 from flask_babel import gettext
 from searx import logger
-from searx.utils import eval_xpath
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text, eval_xpath
 
 # pylint: disable=unused-import
 from searx.engines.google import (
@@ -14,7 +14,7 @@ from datetime import date, timedelta
 from json import loads
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 import re
 
 # engine dependent config
@@ -16,7 +16,7 @@ from urllib.parse import urlencode
 from lxml import html
 from dateutil import parser
 from html.parser import HTMLParser
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 # engine dependent config
@@ -13,8 +13,7 @@
 from lxml import html
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, convert_str_to_int
+from searx.utils import extract_text, get_torrent_size, convert_str_to_int
 
 # engine dependent config
 categories = ['videos', 'music', 'files']
@@ -11,8 +11,7 @@
 
 from lxml import html
 from urllib.parse import urlencode
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'images', 'videos', 'music']
@@ -13,8 +13,7 @@ from datetime import datetime
 from operator import itemgetter
 
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ["videos", "music", "files"]
@@ -12,7 +12,7 @@ from lxml import html
 from json import loads
 from operator import itemgetter
 from urllib.parse import quote, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 
 url = 'https://seedpeer.me/'
@@ -12,7 +12,7 @@
 
 from urllib.parse import urlencode, urljoin
 from lxml import html
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['it']
@@ -17,9 +17,8 @@ import re
 from unicodedata import normalize, combining
 from babel import Locale
 from babel.localedata import locale_identifiers
-from searx.engines.xpath import extract_text
 from searx.languages import language_codes
-from searx.utils import eval_xpath, match_language
+from searx.utils import extract_text, eval_xpath, match_language
 
 # engine dependent config
 categories = ['general']
@@ -13,9 +13,8 @@
 import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text
 from datetime import datetime
-from searx.utils import get_torrent_size, int_or_zero
+from searx.utils import extract_text, get_torrent_size, int_or_zero
 
 # engine dependent config
 categories = ['files', 'videos', 'music']
@@ -15,8 +15,7 @@ import re
 from urllib.parse import urlencode
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 
 # engine dependent config
 categories = ['files', 'videos', 'music']
@@ -15,7 +15,7 @@
 from urllib.parse import urlencode, urljoin
 from lxml import html
 from datetime import datetime
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['social media']
@@ -13,9 +13,8 @@
 
 from searx import logger
 from searx.poolrequests import get
-from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, match_language, eval_xpath
 
 from urllib.parse import urlencode
 from json import loads
@@ -12,7 +12,7 @@
 
 from lxml import html
 from urllib.parse import urlencode, urljoin
-from searx.engines.xpath import extract_text
+from searx.utils import extract_text
 
 # engine dependent config
 categories = ['images']
@@ -1,7 +1,6 @@
-from urllib.parse import unquote, urlencode, urljoin, urlparse
 from lxml import html
-from lxml.etree import _ElementStringResult, _ElementUnicodeResult
-from searx.utils import html_to_text, eval_xpath
+from urllib.parse import urlencode
+from searx.utils import extract_text, extract_url, eval_xpath
 
 search_url = None
 url_xpath = None
@@ -21,76 +20,6 @@ page_size = 1
 first_page_num = 1
 
 
-'''
-if xpath_results is list, extract the text from each result and concat the list
-if xpath_results is a xml element, extract all the text node from it
-   ( text_content() method from lxml )
-if xpath_results is a string element, then it's already done
-'''
-
-
-def extract_text(xpath_results):
-    if type(xpath_results) == list:
-        # it's list of result : concat everything using recursive call
-        result = ''
-        for e in xpath_results:
-            result = result + extract_text(e)
-        return result.strip()
-    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
-        # it's a string
-        return ''.join(xpath_results)
-    else:
-        # it's a element
-        text = html.tostring(
-            xpath_results, encoding='unicode', method='text', with_tail=False
-        )
-        text = text.strip().replace('\n', ' ')
-        return ' '.join(text.split())
-
-
-def extract_url(xpath_results, search_url):
-    if xpath_results == []:
-        raise Exception('Empty url resultset')
-    url = extract_text(xpath_results)
-
-    if url.startswith('//'):
-        # add http or https to this kind of url //example.com/
-        parsed_search_url = urlparse(search_url)
-        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
-    elif url.startswith('/'):
-        # fix relative url to the search engine
-        url = urljoin(search_url, url)
-
-    # fix relative urls that fall through the crack
-    if '://' not in url:
-        url = urljoin(search_url, url)
-
-    # normalize url
-    url = normalize_url(url)
-
-    return url
-
-
-def normalize_url(url):
-    parsed_url = urlparse(url)
-
-    # add a / at this end of the url if there is no path
-    if not parsed_url.netloc:
-        raise Exception('Cannot parse url')
-    if not parsed_url.path:
-        url += '/'
-
-    # FIXME : hack for yahoo
-    if parsed_url.hostname == 'search.yahoo.com'\
-       and parsed_url.path.startswith('/r'):
-        p = parsed_url.path
-        mark = p.find('/**')
-        if mark != -1:
-            return unquote(p[mark + 3:]).decode()
-
-    return url
-
-
 def request(query, params):
     query = urlencode({'q': query})[2:]
 
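The two hunks above strip the xpath engine module down to its engine logic: the module-level docstring and the extract_text, extract_url and normalize_url definitions are removed and reappear unchanged in searx.utils below. As the moved docstring describes, extract_text dispatches on the shape of the XPath result; a short sketch of the three cases, assuming the new import path (the markup is illustrative):

    from lxml import html
    from searx.utils import extract_text

    dom = html.fromstring('<p>Hello <b>world</b></p>')
    extract_text(dom)                      # element: all of its text nodes -> 'Hello world'
    extract_text(dom.xpath('//b'))         # list: each result extracted, concatenated -> 'world'
    extract_text(dom.xpath('//b/text()'))  # string results: joined as-is -> 'world'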
@@ -13,8 +13,7 @@
 
 from urllib.parse import unquote, urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
-from searx.utils import match_language, eval_xpath
+from searx.utils import extract_text, extract_url, match_language, eval_xpath
 
 # engine dependent config
 categories = ['general']
@@ -13,12 +13,11 @@ import re
 from datetime import datetime, timedelta
 from urllib.parse import urlencode
 from lxml import html
-from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import (
     parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
 )
 from dateutil import parser
-from searx.utils import match_language
+from searx.utils import extract_text, extract_url, match_language
 
 # engine dependent config
 categories = ['news']
@@ -12,8 +12,7 @@ from lxml import html
 from operator import itemgetter
 from datetime import datetime
 from urllib.parse import quote
-from searx.engines.xpath import extract_text
-from searx.utils import get_torrent_size
+from searx.utils import extract_text, get_torrent_size
 from searx.poolrequests import get as http_get
 
 # engine dependent config
@@ -11,8 +11,7 @@
 from functools import reduce
 from json import loads
 from urllib.parse import quote_plus
-from searx.engines.xpath import extract_text
-from searx.utils import list_get
+from searx.utils import extract_text, list_get
 
 # engine dependent config
 categories = ['videos', 'music']
@@ -10,9 +10,13 @@ from os.path import splitext, join
 from io import open
 from random import choice
 from html.parser import HTMLParser
-from lxml.etree import XPath
+from urllib.parse import urljoin, urlparse, unquote
+
+from lxml import html
+from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult
 from babel.core import get_global
 
+
 from searx import settings
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
@@ -106,6 +110,74 @@ def html_to_text(html):
     return s.get_text()
 
 
+def extract_text(xpath_results):
+    '''
+    if xpath_results is list, extract the text from each result and concat the list
+    if xpath_results is a xml element, extract all the text node from it
+    ( text_content() method from lxml )
+    if xpath_results is a string element, then it's already done
+    '''
+    if type(xpath_results) == list:
+        # it's list of result : concat everything using recursive call
+        result = ''
+        for e in xpath_results:
+            result = result + extract_text(e)
+        return result.strip()
+    elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]:
+        # it's a string
+        return ''.join(xpath_results)
+    else:
+        # it's a element
+        text = html.tostring(
+            xpath_results, encoding='unicode', method='text', with_tail=False
+        )
+        text = text.strip().replace('\n', ' ')
+        return ' '.join(text.split())
+
+
+def extract_url(xpath_results, search_url):
+    if xpath_results == []:
+        raise Exception('Empty url resultset')
+    url = extract_text(xpath_results)
+
+    if url.startswith('//'):
+        # add http or https to this kind of url //example.com/
+        parsed_search_url = urlparse(search_url)
+        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
+    elif url.startswith('/'):
+        # fix relative url to the search engine
+        url = urljoin(search_url, url)
+
+    # fix relative urls that fall through the crack
+    if '://' not in url:
+        url = urljoin(search_url, url)
+
+    # normalize url
+    url = normalize_url(url)
+
+    return url
+
+
+def normalize_url(url):
+    parsed_url = urlparse(url)
+
+    # add a / at this end of the url if there is no path
+    if not parsed_url.netloc:
+        raise Exception('Cannot parse url')
+    if not parsed_url.path:
+        url += '/'
+
+    # FIXME : hack for yahoo
+    if parsed_url.hostname == 'search.yahoo.com'\
+       and parsed_url.path.startswith('/r'):
+        p = parsed_url.path
+        mark = p.find('/**')
+        if mark != -1:
+            return unquote(p[mark + 3:]).decode()
+
+    return url
+
+
 def dict_subset(d, properties):
     result = {}
     for k in properties:
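extract_url resolves whatever the XPath result yields against the engine's search_url: scheme-relative (//host) and path-relative (/path) URLs are completed, and normalize_url appends a trailing slash when the path is empty. A sketch that mirrors the new unit tests below:

    from lxml import html
    from searx.utils import extract_url

    def f(html_str, search_url):
        return extract_url(html.fromstring(html_str), search_url)

    f('//example.com', 'https://example.com/')       # scheme added -> 'https://example.com/'
    f('/path?a=1', 'https://example.com')            # joined -> 'https://example.com/path?a=1'
    f('https://example.com', 'http://example.com/')  # trailing slash added -> 'https://example.com/'
    extract_url([], 'https://example.com')           # empty result set -> raises Exception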
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
 from searx.testing import SearxTestCase
 from searx import utils
 
@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
         self.assertTrue(utils.searx_useragent().startswith('searx'))
 
     def test_html_to_text(self):
-        html = """
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertIsNotNone(utils.html_to_text(html_str))
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+    def test_extract_text(self):
+        html_str = """
         <a href="/testlink" class="link_access_account">
             <span class="toto">
                 <span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
             </span>
         </a>
         """
-        self.assertIsInstance(utils.html_to_text(html), str)
-        self.assertIsNotNone(utils.html_to_text(html))
-        self.assertEqual(utils.html_to_text(html), "Test text")
+        dom = html.fromstring(html_str)
+        self.assertEqual(utils.extract_text(dom), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+    def test_extract_url(self):
+        def f(html_str, search_url):
+            return utils.extract_url(html.fromstring(html_str), search_url)
+        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+        with self.assertRaises(lxml.etree.ParserError):
+            f('', 'https://example.com')
+        with self.assertRaises(Exception):
+            utils.extract_url([], 'https://example.com')
 
     def test_html_to_text_invalid(self):
         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
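With the helpers consolidated, an engine's response parser needs only searx.utils. A hypothetical sketch of the resulting import style in context (the engine URL and XPath expressions are illustrative, not taken from this commit):

    from lxml import html
    from searx.utils import extract_text, extract_url

    search_url = 'https://example.org/?q={query}'  # placeholder engine URL

    def response(resp):
        results = []
        dom = html.fromstring(resp.text)
        for result in dom.xpath('//div[@class="result"]'):  # illustrative XPath
            results.append({
                'url': extract_url(result.xpath('.//a/@href'), search_url),
                'title': extract_text(result.xpath('.//h3')),
                'content': extract_text(result.xpath('.//p')),
            })
        return results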