mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-04 03:27:06 -05:00 
			
		
		
		
	[fix] revision of utils.HTMLTextExtractor (#5125)
Related: - https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
		
							parent
							
								
									b606103352
								
							
						
					
					
						commit
						4fb6105d69
					
				@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
 | 
				
			|||||||
    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 | 
					    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class _HTMLTextExtractorException(Exception):
 | 
					class HTMLTextExtractor(HTMLParser):
 | 
				
			||||||
    """Internal exception raised when the HTML is invalid"""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class _HTMLTextExtractor(HTMLParser):
 | 
					 | 
				
			||||||
    """Internal class to extract text from HTML"""
 | 
					    """Internal class to extract text from HTML"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self):
 | 
					    def __init__(self):
 | 
				
			||||||
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
 | 
				
			|||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if tag != self.tags[-1]:
 | 
					        if tag != self.tags[-1]:
 | 
				
			||||||
            raise _HTMLTextExtractorException()
 | 
					            self.result.append(f"</{tag}>")
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.tags.pop()
 | 
					        self.tags.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
 | 
				
			|||||||
        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
 | 
					        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
 | 
				
			||||||
        'Example'
 | 
					        'Example'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
 | 
					        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
 | 
				
			||||||
        'regexp: (?<![a-zA-Z]'
 | 
					        'regexp: (?<![a-zA-Z]'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
 | 
				
			||||||
 | 
					        'Lorem ipsum </i>dolor sit amet</p>'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        >>> html_to_text(r'> < a')
 | 
				
			||||||
 | 
					        '> < a'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    if not html_str:
 | 
					    if not html_str:
 | 
				
			||||||
        return ""
 | 
					        return ""
 | 
				
			||||||
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
 | 
					    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
 | 
				
			||||||
    html_str = ' '.join(html_str.split())
 | 
					    html_str = ' '.join(html_str.split())
 | 
				
			||||||
    s = _HTMLTextExtractor()
 | 
					    s = HTMLTextExtractor()
 | 
				
			||||||
    try:
 | 
					    try:
 | 
				
			||||||
        s.feed(html_str)
 | 
					        s.feed(html_str)
 | 
				
			||||||
        s.close()
 | 
					        s.close()
 | 
				
			||||||
    except AssertionError:
 | 
					    except AssertionError:
 | 
				
			||||||
        s = _HTMLTextExtractor()
 | 
					        s = HTMLTextExtractor()
 | 
				
			||||||
        s.feed(escape(html_str, quote=True))
 | 
					        s.feed(escape(html_str, quote=True))
 | 
				
			||||||
        s.close()
 | 
					        s.close()
 | 
				
			||||||
    except _HTMLTextExtractorException:
 | 
					 | 
				
			||||||
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
 | 
					 | 
				
			||||||
    return s.get_text()
 | 
					    return s.get_text()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
 | 
				
			|||||||
        self.assertIsNotNone(utils.searxng_useragent())
 | 
					        self.assertIsNotNone(utils.searxng_useragent())
 | 
				
			||||||
        self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
 | 
					        self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_html_to_text(self):
 | 
					 | 
				
			||||||
        html_str = """
 | 
					 | 
				
			||||||
        <a href="/testlink" class="link_access_account">
 | 
					 | 
				
			||||||
            <style>
 | 
					 | 
				
			||||||
                .toto {
 | 
					 | 
				
			||||||
                    color: red;
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            </style>
 | 
					 | 
				
			||||||
            <span class="toto">
 | 
					 | 
				
			||||||
                <span>
 | 
					 | 
				
			||||||
                    <img src="test.jpg" />
 | 
					 | 
				
			||||||
                </span>
 | 
					 | 
				
			||||||
            </span>
 | 
					 | 
				
			||||||
            <span class="titi">
 | 
					 | 
				
			||||||
                            Test text
 | 
					 | 
				
			||||||
            </span>
 | 
					 | 
				
			||||||
            <script>value='dummy';</script>
 | 
					 | 
				
			||||||
        </a>
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        self.assertIsInstance(utils.html_to_text(html_str), str)
 | 
					 | 
				
			||||||
        self.assertIsNotNone(utils.html_to_text(html_str))
 | 
					 | 
				
			||||||
        self.assertEqual(utils.html_to_text(html_str), "Test text")
 | 
					 | 
				
			||||||
        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def test_extract_text(self):
 | 
					    def test_extract_text(self):
 | 
				
			||||||
        html_str = """
 | 
					        html_str = """
 | 
				
			||||||
        <a href="/testlink" class="link_access_account">
 | 
					        <a href="/testlink" class="link_access_account">
 | 
				
			||||||
@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
 | 
				
			|||||||
        with self.assertRaises(Exception):
 | 
					        with self.assertRaises(Exception):
 | 
				
			||||||
            utils.extract_url([], 'https://example.com')
 | 
					            utils.extract_url([], 'https://example.com')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_html_to_text_invalid(self):
 | 
					 | 
				
			||||||
        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
 | 
					 | 
				
			||||||
        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def test_ecma_unscape(self):
 | 
					    def test_ecma_unscape(self):
 | 
				
			||||||
        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
 | 
					        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
 | 
				
			||||||
        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
 | 
					        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
 | 
				
			||||||
        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
 | 
					        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def setUp(self):
 | 
					 | 
				
			||||||
        super().setUp()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def test__init__(self):
 | 
					 | 
				
			||||||
        self.assertEqual(self.html_text_extractor.result, [])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @parameterized.expand(
 | 
					    @parameterized.expand(
 | 
				
			||||||
        [
 | 
					        [
 | 
				
			||||||
            ('xF', '\x0f'),
 | 
					            ('Example <span id="42">#2</span>', 'Example #2'),
 | 
				
			||||||
            ('XF', '\x0f'),
 | 
					            ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
 | 
				
			||||||
            ('97', 'a'),
 | 
					            (r'regexp: (?<![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
 | 
				
			||||||
 | 
					            (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
 | 
				
			||||||
 | 
					            (r'> < a', '> < a'),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    def test_handle_charref(self, charref: str, expected: str):
 | 
					    def test_html_to_text(self, html_str: str, text_str: str):
 | 
				
			||||||
        self.html_text_extractor.handle_charref(charref)
 | 
					        self.assertEqual(utils.html_to_text(html_str), text_str)
 | 
				
			||||||
        self.assertIn(expected, self.html_text_extractor.result)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_handle_entityref(self):
 | 
					    def test_html_to_text_with_a_style_span(self):
 | 
				
			||||||
        entity = 'test'
 | 
					        html_str = """
 | 
				
			||||||
        self.html_text_extractor.handle_entityref(entity)
 | 
					        <a href="/testlink" class="link_access_account">
 | 
				
			||||||
        self.assertIn(entity, self.html_text_extractor.result)
 | 
					            <style>
 | 
				
			||||||
 | 
					                .toto {
 | 
				
			||||||
    def test_invalid_html(self):
 | 
					                    color: red;
 | 
				
			||||||
        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
 | 
					                }
 | 
				
			||||||
        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
 | 
					            </style>
 | 
				
			||||||
            self.html_text_extractor.feed(text)
 | 
					            <span class="toto">
 | 
				
			||||||
 | 
					                <span>
 | 
				
			||||||
 | 
					                    <img src="test.jpg" />
 | 
				
			||||||
 | 
					                </span>
 | 
				
			||||||
 | 
					            </span>
 | 
				
			||||||
 | 
					            <span class="titi">
 | 
				
			||||||
 | 
					                            Test text
 | 
				
			||||||
 | 
					            </span>
 | 
				
			||||||
 | 
					            <script>value='dummy';</script>
 | 
				
			||||||
 | 
					        </a>
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        self.assertIsInstance(utils.html_to_text(html_str), str)
 | 
				
			||||||
 | 
					        self.assertEqual(utils.html_to_text(html_str), "Test text")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
 | 
					class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user