mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-25 15:52:31 -04:00 
			
		
		
		
	
						commit
						678b87f9d5
					
				
							
								
								
									
										83
									
								
								searx/engines/ina.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										83
									
								
								searx/engines/ina.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,83 @@ | ||||
| #  INA (Videos) | ||||
| # | ||||
| # @website     https://www.ina.fr/ | ||||
| # @provide-api no | ||||
| # | ||||
| # @using-api   no | ||||
| # @results     HTML (using search portal) | ||||
| # @stable      no (HTML can change) | ||||
| # @parse       url, title, content, publishedDate, thumbnail | ||||
| # | ||||
| # @todo        set content-parameter with correct data | ||||
| # @todo        embedded (needs some md5 from video page) | ||||
| 
 | ||||
| from json import loads | ||||
| from urllib import urlencode | ||||
| from lxml import html | ||||
| from HTMLParser import HTMLParser | ||||
| from searx.engines.xpath import extract_text | ||||
| from dateutil import parser | ||||
| 
 | ||||
# --- engine settings -------------------------------------------------------
# Category this engine is listed under.
categories = ['videos']
# The engine can be asked for further result pages.
paging = True
# Number of hits requested per page (sent as the ``hf`` URL parameter).
page_size = 48

# --- endpoint --------------------------------------------------------------
# Site root; also prepended to relative links found in the results.
base_url = 'https://www.ina.fr'
# URL template; ``{ps}``, ``{start}`` and ``{query}`` are filled in by request().
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'

# --- xpath selectors -------------------------------------------------------
# One node per hit; every other selector is evaluated relative to such a node.
results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
content_xpath = './/p[@class="media-body__summary"]'
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
def request(query, params):
    """Build the INA search URL for ``query`` and store it in ``params``.

    :param query: search terms; URL-encoded as the ``q`` parameter.
    :param params: request parameters. ``params['pageno']`` is read and
                   ``params['url']`` is written.
    :returns: the same ``params`` object, with ``url`` set.
    """
    # searx page numbers start at 1, while ``b`` is used here as a result
    # offset, so page 1 has to map to offset 0 instead of skipping the first
    # ``page_size`` hits (assumes ``b`` is zero-based -- TODO confirm against
    # the live endpoint). The clamp keeps callers passing pageno=0 working.
    offset = max(params['pageno'] - 1, 0) * page_size

    params['url'] = search_url.format(ps=page_size,
                                      start=offset,
                                      query=urlencode({'q': query}))

    return params
| 
 | ||||
| 
 | ||||
| # get response from search-request | ||||
def response(resp):
    """Convert the INA search reply into a list of video result dicts.

    The reply body (``resp.text``) is JSON whose ``content`` entry holds the
    HTML fragment with the hits.

    :param resp: response object; only its ``text`` attribute is read.
    :returns: list of dicts with the keys ``url``, ``title``, ``content``,
              ``template`` and ``thumbnail``, plus ``publishedDate`` when the
              broadcast date could be parsed. Empty list when the reply has
              no (or empty) ``content``.
    :raises AttributeError: if ``resp`` has no ``text`` attribute.
    """
    results = []

    # we get html in a JSON container...
    payload = loads(resp.text)
    if "content" not in payload:
        return []
    markup = payload["content"]
    if not markup:
        # lxml refuses to parse an empty document
        return []
    dom = html.fromstring(markup)
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(url_xpath)
        if not links:
            # a hit without a link cannot be turned into a result
            continue
        url = base_url + links[0]
        title = p.unescape(extract_text(result.xpath(title_xpath)))

        thumbnails = result.xpath(thumbnail_xpath)
        thumbnail = extract_text(thumbnails[0]) if thumbnails else ''
        if thumbnail.startswith('/'):
            # relative image path: make it absolute
            thumbnail = base_url + thumbnail

        publishedDate = None
        try:
            d = extract_text(result.xpath(publishedDate_xpath)[0]).split('/')
            # force ISO date to avoid wrong parsing
            publishedDate = parser.parse("%s-%s-%s" % (d[2], d[1], d[0]))
        except (IndexError, ValueError, OverflowError):
            # missing or malformed date: keep the hit, just without a date
            publishedDate = None

        content = extract_text(result.xpath(content_xpath))

        # append result
        item = {'url': url,
                'title': title,
                'content': content,
                'template': 'videos.html',
                'thumbnail': thumbnail}
        if publishedDate is not None:
            item['publishedDate'] = publishedDate
        results.append(item)

    # return results
    return results
| @ -254,6 +254,12 @@ engines: | ||||
|     disabled : True | ||||
|     shortcut : habr | ||||
| 
 | ||||
|   - name : ina | ||||
|     engine : ina | ||||
|     shortcut : in | ||||
|     timeout : 6.0 | ||||
|     disabled : True | ||||
| 
 | ||||
|   - name : mixcloud | ||||
|     engine : mixcloud | ||||
|     shortcut : mc | ||||
|  | ||||
							
								
								
									
										64
									
								
								tests/unit/engines/test_ina.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								tests/unit/engines/test_ina.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,64 @@ | ||||
| from collections import defaultdict | ||||
| import mock | ||||
| from searx.engines import ina | ||||
| from searx.testing import SearxTestCase | ||||
| 
 | ||||
| 
 | ||||
class TestInaEngine(SearxTestCase):
    """Unit tests for the ``ina`` engine's ``request`` and ``response``."""

    def test_request(self):
        """The built URL targets ina.fr and carries the query."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 0
        params = ina.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('ina.fr', params['url'])

    def test_response(self):
        """Bad inputs raise, empty replies give [], a real hit is parsed."""
        # objects without a ``text`` attribute are rejected
        self.assertRaises(AttributeError, ina.response, None)
        self.assertRaises(AttributeError, ina.response, [])
        self.assertRaises(AttributeError, ina.response, '')
        self.assertRaises(AttributeError, ina.response, '[]')

        # JSON without a "content" entry yields no results
        response = mock.Mock(text='{}')
        self.assertEqual(ina.response(response), [])

        response = mock.Mock(text='{"data": []}')
        self.assertEqual(ina.response(response), [])

        # one complete hit, as delivered by the search endpoint
        json_text = """
        {"content":"\\t<div class=\\"container\\">\\n\\t\\n\
        <!-- DEBUT CONTENU PRINCIPAL -->\\n<div class=\\"row\\">\\n\
        <div class=\\"search-results--list\\"><div class=\\"media\\">\\n\
        \\t\\t\\t\\t<a class=\\"media-left  media-video  premium    xiti_click_action\\" \
        data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
        href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\\n\
        <img src=\\"https:\/\/www.ina.fr\/images_v2\/140x105\/CAF89035682.jpeg\\" \
        alt=\\"Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle \\">\\n\
        \\t\\t\\t\\t\\t<\/a>\\n\
        \\t\\t\\t\\t\\t<div class=\\"media-body\\">\\n\\t\\t\\t\\t\\t\\t<h3 class=\\"h3--title media-heading\\">\\n\
        \\t\\t\\t\\t\\t\\t\\t<a class=\\"xiti_click_action\\" \
        data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
        href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\
        Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle <\/a>\\n\
        <\/h3>\\n\
        <div class=\\"media-body__info\\">\\n<span class=\\"broadcast\\">27\/11\/1967<\/span>\\n\
        <span class=\\"views\\">29321 vues<\/span>\\n\
        <span class=\\"duration\\">01h 33m 07s<\/span>\\n\
        <\/div>\\n\
        <p class=\\"media-body__summary\\">VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE . \
              - PA le Pr\\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...<\/p>\\n\
        <\/div>\\n<\/div><!-- \/.media -->\\n"
        }
        """
        response = mock.Mock(text=json_text)
        results = ina.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], u'Conf\xe9rence de presse du G\xe9n\xe9ral de Gaulle')
        self.assertEqual(results[0]['url'],
                         'https://www.ina.fr/video/CAF89035682/conference-de-presse-du-general-de-gaulle-video.html')
        self.assertEqual(results[0]['content'],
                         u"VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE ."
                         u" - PA le Pr\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...")
        # the absolute thumbnail URL must be passed through unchanged
        self.assertEqual(results[0]['thumbnail'],
                         'https://www.ina.fr/images_v2/140x105/CAF89035682.jpeg')
        # the fixture's broadcast date is day/month/year: 27/11/1967
        published = results[0]['publishedDate']
        self.assertEqual((published.year, published.month, published.day), (1967, 11, 27))
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user