Mirror of https://github.com/searxng/searxng.git (synced 2025-10-28 01:02:30 -04:00)
Merge pull request #588 from a01200356/wikidata

[enh] More data from Wikidata

This commit is contained in:

commit f1262ffa9e

.gitignore (vendored): 1 line added
| @@ -1,4 +1,5 @@ | ||||
| .coverage | ||||
| coverage/ | ||||
| .installed.cfg | ||||
| engines.cfg | ||||
| env | ||||
|  | ||||
| @@ -1,56 +1,86 @@ | ||||
| import json | ||||
| # -*- coding: utf-8 -*- | ||||
| """ | ||||
|  Wikidata | ||||
| 
 | ||||
|  @website     https://wikidata.org | ||||
|  @provide-api yes (https://wikidata.org/w/api.php) | ||||
| 
 | ||||
|  @using-api   partially (most things require scraping) | ||||
|  @results     JSON, HTML | ||||
|  @stable      no (html can change) | ||||
|  @parse       url, infobox | ||||
| """ | ||||
| 
 | ||||
| from searx import logger | ||||
| from searx.poolrequests import get | ||||
| from searx.utils import format_date_by_locale | ||||
| from searx.engines.xpath import extract_text | ||||
| 
 | ||||
| from datetime import datetime | ||||
| from dateutil.parser import parse as dateutil_parse | ||||
| from json import loads | ||||
| from lxml.html import fromstring | ||||
| from urllib import urlencode | ||||
| 
 | ||||
| 
 | ||||
| logger = logger.getChild('wikidata') | ||||
| result_count = 1 | ||||
| 
 | ||||
| # urls | ||||
| wikidata_host = 'https://www.wikidata.org' | ||||
| url_search = wikidata_host \ | ||||
|     + '/wiki/Special:ItemDisambiguation?{query}' | ||||
| 
 | ||||
| wikidata_api = wikidata_host + '/w/api.php' | ||||
| url_search = wikidata_api \ | ||||
|     + '?action=query&list=search&format=json'\ | ||||
|     + '&srnamespace=0&srprop=sectiontitle&{query}' | ||||
| url_detail = wikidata_api\ | ||||
|     + '?action=wbgetentities&format=json'\ | ||||
|     + '&props=labels%7Cinfo%7Csitelinks'\ | ||||
|     + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\ | ||||
|     + '&{query}' | ||||
|     + '?action=parse&format=json&{query}'\ | ||||
|     + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\ | ||||
|     + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2' | ||||
| 
 | ||||
| url_map = 'https://www.openstreetmap.org/'\ | ||||
|     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' | ||||
| url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' | ||||
| 
 | ||||
| # xpaths | ||||
| wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' | ||||
| title_xpath = '//*[contains(@class,"wikibase-title-label")]' | ||||
| description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' | ||||
| property_xpath = '//div[@id="{propertyid}"]' | ||||
| label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' | ||||
| url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' | ||||
| wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ | ||||
|     + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' | ||||
| property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' | ||||
| preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' | ||||
| value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ | ||||
|     + '/*/div[contains(@class,"wikibase-snakview-value")]' | ||||
| language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' | ||||
| calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' | ||||
| 
 | ||||
| 
 | ||||
| def request(query, params): | ||||
|     language = params['language'].split('_')[0] | ||||
|     if language == 'all': | ||||
|         language = 'en' | ||||
| 
 | ||||
|     params['url'] = url_search.format( | ||||
|         query=urlencode({'srsearch': query, | ||||
|                         'srlimit': result_count})) | ||||
|         query=urlencode({'label': query, | ||||
|                         'language': language})) | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def response(resp): | ||||
|     results = [] | ||||
|     search_res = json.loads(resp.text) | ||||
| 
 | ||||
|     wikidata_ids = set() | ||||
|     for r in search_res.get('query', {}).get('search', {}): | ||||
|         wikidata_ids.add(r.get('title', '')) | ||||
|     html = fromstring(resp.content) | ||||
|     wikidata_ids = html.xpath(wikidata_ids_xpath) | ||||
| 
 | ||||
|     language = resp.search_params['language'].split('_')[0] | ||||
|     if language == 'all': | ||||
|         language = 'en' | ||||
| 
 | ||||
|     url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids), | ||||
|                                             'languages': language + '|en'})) | ||||
| 
 | ||||
|     htmlresponse = get(url) | ||||
|     jsonresponse = json.loads(htmlresponse.content) | ||||
|     for wikidata_id in wikidata_ids: | ||||
|         results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) | ||||
|     # TODO: make requests asynchronous to avoid timeout when result_count > 1 | ||||
|     for wikidata_id in wikidata_ids[:result_count]: | ||||
|         url = url_detail.format(query=urlencode({'page': wikidata_id, | ||||
|                                                 'uselang': language})) | ||||
|         htmlresponse = get(url) | ||||
|         jsonresponse = loads(htmlresponse.content) | ||||
|         results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) | ||||
| 
 | ||||
|     return results | ||||
| 
 | ||||
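The TODO above notes that the detail pages are fetched one request at a time, which can exceed the engine timeout once result_count is raised above 1. A minimal sketch of one way to parallelize that loop, assuming a requests-like get() such as searx.poolrequests.get (hypothetical helper, not part of this patch):

# Hypothetical sketch, not part of the patch: fetch per-id detail pages
# concurrently instead of in the sequential loop above.
from concurrent.futures import ThreadPoolExecutor
from json import loads
from urllib.parse import urlencode  # Python 3 spelling; the engine imports from urllib (Python 2)

def fetch_details(wikidata_ids, language, url_detail, get):
    # 'get' is assumed to behave like searx.poolrequests.get
    def fetch(wikidata_id):
        url = url_detail.format(query=urlencode({'page': wikidata_id,
                                                 'uselang': language}))
        return loads(get(url).content)

    # one worker per id keeps total latency close to the slowest single request
    with ThreadPoolExecutor(max_workers=max(len(wikidata_ids), 1)) as pool:
        return list(pool.map(fetch, wikidata_ids))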
| @@ -60,124 +90,206 @@ def getDetail(jsonresponse, wikidata_id, language, locale): | ||||
|     urls = [] | ||||
|     attributes = [] | ||||
| 
 | ||||
|     result = jsonresponse.get('entities', {}).get(wikidata_id, {}) | ||||
|     title = jsonresponse.get('parse', {}).get('displaytitle', {}) | ||||
|     result = jsonresponse.get('parse', {}).get('text', {}) | ||||
| 
 | ||||
|     title = result.get('labels', {}).get(language, {}).get('value', None) | ||||
|     if title is None: | ||||
|         title = result.get('labels', {}).get('en', {}).get('value', None) | ||||
|     if title is None: | ||||
|     if not title or not result: | ||||
|         return results | ||||
| 
 | ||||
|     description = result\ | ||||
|         .get('descriptions', {})\ | ||||
|         .get(language, {})\ | ||||
|         .get('value', None) | ||||
|     title = fromstring(title) | ||||
|     for elem in title.xpath(language_fallback_xpath): | ||||
|         elem.getparent().remove(elem) | ||||
|     title = extract_text(title.xpath(title_xpath)) | ||||
| 
 | ||||
|     if description is None: | ||||
|         description = result\ | ||||
|             .get('descriptions', {})\ | ||||
|             .get('en', {})\ | ||||
|             .get('value', '') | ||||
|     result = fromstring(result) | ||||
|     for elem in result.xpath(language_fallback_xpath): | ||||
|         elem.getparent().remove(elem) | ||||
| 
 | ||||
|     claims = result.get('claims', {}) | ||||
|     official_website = get_string(claims, 'P856', None) | ||||
|     if official_website is not None: | ||||
|         urls.append({'title': 'Official site', 'url': official_website}) | ||||
|         results.append({'title': title, 'url': official_website}) | ||||
|     description = extract_text(result.xpath(description_xpath)) | ||||
| 
 | ||||
|     # URLS | ||||
| 
 | ||||
|     # official website | ||||
|     add_url(urls, result, 'P856', results=results) | ||||
| 
 | ||||
|     # wikipedia | ||||
|     wikipedia_link_count = 0 | ||||
|     wikipedia_link = get_wikilink(result, language + 'wiki') | ||||
|     wikipedia_link_count += add_url(urls, | ||||
|                                     'Wikipedia (' + language + ')', | ||||
|                                     wikipedia_link) | ||||
|     if wikipedia_link: | ||||
|         wikipedia_link_count += 1 | ||||
|         urls.append({'title': 'Wikipedia (' + language + ')', | ||||
|                      'url': wikipedia_link}) | ||||
| 
 | ||||
|     if language != 'en': | ||||
|         wikipedia_en_link = get_wikilink(result, 'enwiki') | ||||
|         wikipedia_link_count += add_url(urls, | ||||
|                                         'Wikipedia (en)', | ||||
|                                         wikipedia_en_link) | ||||
|     if wikipedia_link_count == 0: | ||||
|         misc_language = get_wiki_firstlanguage(result, 'wiki') | ||||
|         if misc_language is not None: | ||||
|             add_url(urls, | ||||
|                     'Wikipedia (' + misc_language + ')', | ||||
|                     get_wikilink(result, misc_language + 'wiki')) | ||||
|         if wikipedia_en_link: | ||||
|             wikipedia_link_count += 1 | ||||
|             urls.append({'title': 'Wikipedia (en)', | ||||
|                          'url': wikipedia_en_link}) | ||||
| 
 | ||||
|     if language != 'en': | ||||
|         add_url(urls, | ||||
|                 'Wiki voyage (' + language + ')', | ||||
|                 get_wikilink(result, language + 'wikivoyage')) | ||||
|     # TODO: get_wiki_firstlanguage | ||||
|     # if wikipedia_link_count == 0: | ||||
| 
 | ||||
|     add_url(urls, | ||||
|             'Wiki voyage (en)', | ||||
|             get_wikilink(result, 'enwikivoyage')) | ||||
|     # more wikis | ||||
|     add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') | ||||
|     add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') | ||||
|     add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki') | ||||
| 
 | ||||
|     if language != 'en': | ||||
|         add_url(urls, | ||||
|                 'Wikiquote (' + language + ')', | ||||
|                 get_wikilink(result, language + 'wikiquote')) | ||||
|     add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo') | ||||
| 
 | ||||
|     add_url(urls, | ||||
|             'Wikiquote (en)', | ||||
|             get_wikilink(result, 'enwikiquote')) | ||||
|     # musicbrainz | ||||
|     add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') | ||||
|     add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') | ||||
|     add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') | ||||
|     add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') | ||||
| 
 | ||||
|     add_url(urls, | ||||
|             'Commons wiki', | ||||
|             get_wikilink(result, 'commonswiki')) | ||||
|     # IMDb | ||||
|     add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') | ||||
|     # source code repository | ||||
|     add_url(urls, result, 'P1324') | ||||
|     # blog | ||||
|     add_url(urls, result, 'P1581') | ||||
|     # social media links | ||||
|     add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') | ||||
|     add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') | ||||
|     add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/') | ||||
|     add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/') | ||||
|     add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/') | ||||
| 
 | ||||
|     add_url(urls, | ||||
|             'Location', | ||||
|             get_geolink(claims, 'P625', None)) | ||||
|     urls.append({'title': 'Wikidata', | ||||
|                  'url': 'https://www.wikidata.org/wiki/' | ||||
|                  + wikidata_id + '?uselang=' + language}) | ||||
| 
 | ||||
|     add_url(urls, | ||||
|             'Wikidata', | ||||
|             'https://www.wikidata.org/wiki/' | ||||
|             + wikidata_id + '?uselang=' + language) | ||||
|     # INFOBOX ATTRIBUTES (ROWS) | ||||
| 
 | ||||
|     musicbrainz_work_id = get_string(claims, 'P435') | ||||
|     if musicbrainz_work_id is not None: | ||||
|         add_url(urls, | ||||
|                 'MusicBrainz', | ||||
|                 'http://musicbrainz.org/work/' | ||||
|                 + musicbrainz_work_id) | ||||
|     # DATES | ||||
|     # inception date | ||||
|     add_attribute(attributes, result, 'P571', date=True) | ||||
|     # dissolution date | ||||
|     add_attribute(attributes, result, 'P576', date=True) | ||||
|     # start date | ||||
|     add_attribute(attributes, result, 'P580', date=True) | ||||
|     # end date | ||||
|     add_attribute(attributes, result, 'P582', date=True) | ||||
|     # date of birth | ||||
|     add_attribute(attributes, result, 'P569', date=True) | ||||
|     # date of death | ||||
|     add_attribute(attributes, result, 'P570', date=True) | ||||
|     # date of spacecraft launch | ||||
|     add_attribute(attributes, result, 'P619', date=True) | ||||
|     # date of spacecraft landing | ||||
|     add_attribute(attributes, result, 'P620', date=True) | ||||
| 
 | ||||
|     musicbrainz_artist_id = get_string(claims, 'P434') | ||||
|     if musicbrainz_artist_id is not None: | ||||
|         add_url(urls, | ||||
|                 'MusicBrainz', | ||||
|                 'http://musicbrainz.org/artist/' | ||||
|                 + musicbrainz_artist_id) | ||||
|     # nationality | ||||
|     add_attribute(attributes, result, 'P27') | ||||
|     # country of origin | ||||
|     add_attribute(attributes, result, 'P495') | ||||
|     # country | ||||
|     add_attribute(attributes, result, 'P17') | ||||
|     # headquarters | ||||
|     add_attribute(attributes, result, 'Q180') | ||||
| 
 | ||||
|     musicbrainz_release_group_id = get_string(claims, 'P436') | ||||
|     if musicbrainz_release_group_id is not None: | ||||
|         add_url(urls, | ||||
|                 'MusicBrainz', | ||||
|                 'http://musicbrainz.org/release-group/' | ||||
|                 + musicbrainz_release_group_id) | ||||
|     # PLACES | ||||
|     # capital | ||||
|     add_attribute(attributes, result, 'P36', trim=True) | ||||
|     # head of state | ||||
|     add_attribute(attributes, result, 'P35', trim=True) | ||||
|     # head of government | ||||
|     add_attribute(attributes, result, 'P6', trim=True) | ||||
|     # type of government | ||||
|     add_attribute(attributes, result, 'P122') | ||||
|     # official language | ||||
|     add_attribute(attributes, result, 'P37') | ||||
|     # population | ||||
|     add_attribute(attributes, result, 'P1082', trim=True) | ||||
|     # area | ||||
|     add_attribute(attributes, result, 'P2046') | ||||
|     # currency | ||||
|     add_attribute(attributes, result, 'P38', trim=True) | ||||
|     # height (building) | ||||
|     add_attribute(attributes, result, 'P2048') | ||||
| 
 | ||||
|     musicbrainz_label_id = get_string(claims, 'P966') | ||||
|     if musicbrainz_label_id is not None: | ||||
|         add_url(urls, | ||||
|                 'MusicBrainz', | ||||
|                 'http://musicbrainz.org/label/' | ||||
|                 + musicbrainz_label_id) | ||||
|     # MEDIA | ||||
|     # platform (videogames) | ||||
|     add_attribute(attributes, result, 'P400') | ||||
|     # author | ||||
|     add_attribute(attributes, result, 'P50') | ||||
|     # creator | ||||
|     add_attribute(attributes, result, 'P170') | ||||
|     # director | ||||
|     add_attribute(attributes, result, 'P57') | ||||
|     # performer | ||||
|     add_attribute(attributes, result, 'P175') | ||||
|     # developer | ||||
|     add_attribute(attributes, result, 'P178') | ||||
|     # producer | ||||
|     add_attribute(attributes, result, 'P162') | ||||
|     # manufacturer | ||||
|     add_attribute(attributes, result, 'P176') | ||||
|     # screenwriter | ||||
|     add_attribute(attributes, result, 'P58') | ||||
|     # production company | ||||
|     add_attribute(attributes, result, 'P272') | ||||
|     # record label | ||||
|     add_attribute(attributes, result, 'P264') | ||||
|     # publisher | ||||
|     add_attribute(attributes, result, 'P123') | ||||
|     # original network | ||||
|     add_attribute(attributes, result, 'P449') | ||||
|     # distributor | ||||
|     add_attribute(attributes, result, 'P750') | ||||
|     # composer | ||||
|     add_attribute(attributes, result, 'P86') | ||||
|     # publication date | ||||
|     add_attribute(attributes, result, 'P577', date=True) | ||||
|     # genre | ||||
|     add_attribute(attributes, result, 'P136') | ||||
|     # original language | ||||
|     add_attribute(attributes, result, 'P364') | ||||
|     # isbn | ||||
|     add_attribute(attributes, result, 'Q33057') | ||||
|     # software license | ||||
|     add_attribute(attributes, result, 'P275') | ||||
|     # programming language | ||||
|     add_attribute(attributes, result, 'P277') | ||||
|     # version | ||||
|     add_attribute(attributes, result, 'P348', trim=True) | ||||
|     # narrative location | ||||
|     add_attribute(attributes, result, 'P840') | ||||
| 
 | ||||
|     # musicbrainz_area_id = get_string(claims, 'P982') | ||||
|     # P1407 MusicBrainz series ID | ||||
|     # P1004 MusicBrainz place ID | ||||
|     # P1330 MusicBrainz instrument ID | ||||
|     # P1407 MusicBrainz series ID | ||||
|     # LANGUAGES | ||||
|     # number of speakers | ||||
|     add_attribute(attributes, result, 'P1098') | ||||
|     # writing system | ||||
|     add_attribute(attributes, result, 'P282') | ||||
|     # regulatory body | ||||
|     add_attribute(attributes, result, 'P1018') | ||||
|     # language code | ||||
|     add_attribute(attributes, result, 'P218') | ||||
| 
 | ||||
|     postal_code = get_string(claims, 'P281', None) | ||||
|     if postal_code is not None: | ||||
|         attributes.append({'label': 'Postal code(s)', 'value': postal_code}) | ||||
|     # OTHER | ||||
|     # ceo | ||||
|     add_attribute(attributes, result, 'P169', trim=True) | ||||
|     # founder | ||||
|     add_attribute(attributes, result, 'P112') | ||||
|     # legal form (company/organization) | ||||
|     add_attribute(attributes, result, 'P1454') | ||||
|     # operator | ||||
|     add_attribute(attributes, result, 'P137') | ||||
|     # crew members | ||||
|     add_attribute(attributes, result, 'P1029') | ||||
|     # taxon | ||||
|     add_attribute(attributes, result, 'P225') | ||||
|     # chemical formula | ||||
|     add_attribute(attributes, result, 'P274') | ||||
|     # winner (sports/contests) | ||||
|     add_attribute(attributes, result, 'P1346') | ||||
|     # number of deaths | ||||
|     add_attribute(attributes, result, 'P1120') | ||||
|     # currency code | ||||
|     add_attribute(attributes, result, 'P498') | ||||
| 
 | ||||
|     date_of_birth = get_time(claims, 'P569', locale, None) | ||||
|     if date_of_birth is not None: | ||||
|         attributes.append({'label': 'Date of birth', 'value': date_of_birth}) | ||||
| 
 | ||||
|     date_of_death = get_time(claims, 'P570', locale, None) | ||||
|     if date_of_death is not None: | ||||
|         attributes.append({'label': 'Date of death', 'value': date_of_death}) | ||||
|     image = add_image(result) | ||||
| 
 | ||||
|     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: | ||||
|         results.append({ | ||||
| @@ -190,6 +302,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): | ||||
|                        'infobox': title, | ||||
|                        'id': wikipedia_link, | ||||
|                        'content': description, | ||||
|                        'img_src': image, | ||||
|                        'attributes': attributes, | ||||
|                        'urls': urls | ||||
|                        }) | ||||
| @@ -197,92 +310,151 @@ def getDetail(jsonresponse, wikidata_id, language, locale): | ||||
|     return results | ||||
| 
 | ||||
| 
 | ||||
| def add_url(urls, title, url): | ||||
|     if url is not None: | ||||
|         urls.append({'title': title, 'url': url}) | ||||
|         return 1 | ||||
| # only returns first match | ||||
| def add_image(result): | ||||
|     # P15: route map, P242: locator map, P154: logo, P18: image, P41: flag, P2716: collage, P2910: icon | ||||
|     property_ids = ['P15', 'P242', 'P154', 'P18', 'P41', 'P2716', 'P2910'] | ||||
| 
 | ||||
|     for property_id in property_ids: | ||||
|         image = result.xpath(property_xpath.replace('{propertyid}', property_id)) | ||||
|         if image: | ||||
|             image_name = image[0].xpath(value_xpath) | ||||
|             image_src = url_image.replace('{filename}', extract_text(image_name[0])) | ||||
|             return image_src | ||||
| 
 | ||||
| 
 | ||||
| # setting trim will only return high-ranked rows OR the first row | ||||
| def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False): | ||||
|     attribute = result.xpath(property_xpath.replace('{propertyid}', property_id)) | ||||
|     if attribute: | ||||
| 
 | ||||
|         if default_label: | ||||
|             label = default_label | ||||
|         else: | ||||
|             label = extract_text(attribute[0].xpath(label_xpath)) | ||||
|             label = label[0].upper() + label[1:] | ||||
| 
 | ||||
|         if date: | ||||
|             trim = True | ||||
|             # remove calendar name | ||||
|             calendar_name = attribute[0].xpath(calendar_name_xpath) | ||||
|             for calendar in calendar_name: | ||||
|                 calendar.getparent().remove(calendar) | ||||
| 
 | ||||
|         concat_values = "" | ||||
|         values = [] | ||||
|         first_value = None | ||||
|         for row in attribute[0].xpath(property_row_xpath): | ||||
|             if not first_value or not trim or row.xpath(preferred_rank_xpath): | ||||
| 
 | ||||
|                 value = row.xpath(value_xpath) | ||||
|                 if not value: | ||||
|                     continue | ||||
|                 value = extract_text(value) | ||||
| 
 | ||||
|                 # save first value in case no ranked row is found | ||||
|                 if trim and not first_value: | ||||
|                     first_value = value | ||||
|                 else: | ||||
|                     # to avoid duplicate values | ||||
|                     if value not in values: | ||||
|                         concat_values += value + ", " | ||||
|                         values.append(value) | ||||
| 
 | ||||
|         if trim and not values: | ||||
|             attributes.append({'label': label, | ||||
|                                'value': first_value}) | ||||
|         else: | ||||
|             attributes.append({'label': label, | ||||
|                                'value': concat_values[:-2]}) | ||||
| 
 | ||||
| 
 | ||||
| # requires property_id unless it's a wiki link (defined in link_type) | ||||
| def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None): | ||||
|     links = [] | ||||
| 
 | ||||
|     # wiki links don't have a property id on the wikidata page | ||||
|     if link_type and 'wiki' in link_type: | ||||
|             links.append(get_wikilink(result, link_type)) | ||||
|     else: | ||||
|         return 0 | ||||
|         dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id)) | ||||
|         if dom_element: | ||||
|             dom_element = dom_element[0] | ||||
|             if not default_label: | ||||
|                 label = extract_text(dom_element.xpath(label_xpath)) | ||||
|                 label = label[0].upper() + label[1:] | ||||
| 
 | ||||
|             if link_type == 'geo': | ||||
|                 links.append(get_geolink(dom_element)) | ||||
| 
 | ||||
|             elif link_type == 'imdb': | ||||
|                 links.append(get_imdblink(dom_element, url_prefix)) | ||||
| 
 | ||||
|             else: | ||||
|                 url_results = dom_element.xpath(url_xpath) | ||||
|                 for link in url_results: | ||||
|                     if link is not None: | ||||
|                         if url_prefix: | ||||
|                             link = url_prefix + extract_text(link) | ||||
|                         else: | ||||
|                             link = extract_text(link) | ||||
|                         links.append(link) | ||||
| 
 | ||||
|     # append urls | ||||
|     for url in links: | ||||
|         if url is not None: | ||||
|             urls.append({'title': default_label or label, | ||||
|                          'url': url}) | ||||
|             if results is not None: | ||||
|                 results.append({'title': default_label or label, | ||||
|                                 'url': url}) | ||||
| 
 | ||||
| 
 | ||||
| def get_mainsnak(claims, propertyName): | ||||
|     propValue = claims.get(propertyName, {}) | ||||
|     if len(propValue) == 0: | ||||
| def get_imdblink(result, url_prefix): | ||||
|     imdb_id = result.xpath(value_xpath) | ||||
|     if imdb_id: | ||||
|         imdb_id = extract_text(imdb_id) | ||||
|         id_prefix = imdb_id[:2] | ||||
|         if id_prefix == 'tt': | ||||
|             url = url_prefix + 'title/' + imdb_id | ||||
|         elif id_prefix == 'nm': | ||||
|             url = url_prefix + 'name/' + imdb_id | ||||
|         elif id_prefix == 'ch': | ||||
|             url = url_prefix + 'character/' + imdb_id | ||||
|         elif id_prefix == 'co': | ||||
|             url = url_prefix + 'company/' + imdb_id | ||||
|         elif id_prefix == 'ev': | ||||
|             url = url_prefix + 'event/' + imdb_id | ||||
|         else: | ||||
|             url = None | ||||
|         return url | ||||
| 
 | ||||
| 
 | ||||
| def get_geolink(result): | ||||
|     coordinates = result.xpath(value_xpath) | ||||
|     if not coordinates: | ||||
|         return None | ||||
|     coordinates = extract_text(coordinates[0]) | ||||
|     latitude, longitude = coordinates.split(',') | ||||
| 
 | ||||
|     propValue = propValue[0].get('mainsnak', None) | ||||
|     return propValue | ||||
| 
 | ||||
| 
 | ||||
| def get_string(claims, propertyName, defaultValue=None): | ||||
|     propValue = claims.get(propertyName, {}) | ||||
|     if len(propValue) == 0: | ||||
|         return defaultValue | ||||
| 
 | ||||
|     result = [] | ||||
|     for e in propValue: | ||||
|         mainsnak = e.get('mainsnak', {}) | ||||
| 
 | ||||
|         datavalue = mainsnak.get('datavalue', {}) | ||||
|         if datavalue is not None: | ||||
|             result.append(datavalue.get('value', '')) | ||||
| 
 | ||||
|     if len(result) == 0: | ||||
|         return defaultValue | ||||
|     else: | ||||
|         # TODO handle multiple urls | ||||
|         return result[0] | ||||
| 
 | ||||
| 
 | ||||
| def get_time(claims, propertyName, locale, defaultValue=None): | ||||
|     propValue = claims.get(propertyName, {}) | ||||
|     if len(propValue) == 0: | ||||
|         return defaultValue | ||||
| 
 | ||||
|     result = [] | ||||
|     for e in propValue: | ||||
|         mainsnak = e.get('mainsnak', {}) | ||||
| 
 | ||||
|         datavalue = mainsnak.get('datavalue', {}) | ||||
|         if datavalue is not None: | ||||
|             value = datavalue.get('value', '') | ||||
|             result.append(value.get('time', '')) | ||||
| 
 | ||||
|     if len(result) == 0: | ||||
|         date_string = defaultValue | ||||
|     else: | ||||
|         date_string = ', '.join(result) | ||||
| 
 | ||||
|     try: | ||||
|         parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ") | ||||
|     except: | ||||
|         if date_string.startswith('-'): | ||||
|             return date_string.split('T')[0] | ||||
|         try: | ||||
|             parsed_date = dateutil_parse(date_string, fuzzy=False, default=False) | ||||
|         except: | ||||
|             logger.debug('could not parse date %s', date_string) | ||||
|             return date_string.split('T')[0] | ||||
| 
 | ||||
|     return format_date_by_locale(parsed_date, locale) | ||||
| 
 | ||||
| 
 | ||||
| def get_geolink(claims, propertyName, defaultValue=''): | ||||
|     mainsnak = get_mainsnak(claims, propertyName) | ||||
| 
 | ||||
|     if mainsnak is None: | ||||
|         return defaultValue | ||||
| 
 | ||||
|     datatype = mainsnak.get('datatype', '') | ||||
|     datavalue = mainsnak.get('datavalue', {}) | ||||
| 
 | ||||
|     if datatype != 'globe-coordinate': | ||||
|         return defaultValue | ||||
| 
 | ||||
|     value = datavalue.get('value', {}) | ||||
| 
 | ||||
|     precision = value.get('precision', 0.0002) | ||||
|     # convert to decimal | ||||
|     lat = int(latitude[:latitude.find(u'°')]) | ||||
|     if latitude.find('\'') >= 0: | ||||
|         lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0 | ||||
|     if latitude.find('"') >= 0: | ||||
|         lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 | ||||
|     if latitude.find('S') >= 0: | ||||
|         lat *= -1 | ||||
|     lon = int(longitude[:longitude.find(u'°')]) | ||||
|     if longitude.find('\'') >= 0: | ||||
|         lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0 | ||||
|     if longitude.find('"') >= 0: | ||||
|         lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 | ||||
|     if longitude.find('W') >= 0: | ||||
|         lon *= -1 | ||||
| 
 | ||||
|     # TODO: get precision | ||||
|     precision = 0.0002 | ||||
|     # there is no zoom information, deduce from precision (error prone) | ||||
|     # samples : | ||||
|     # 13 --> 5 | ||||
| @@ -298,26 +470,20 @@ def get_geolink(claims, propertyName, defaultValue=''): | ||||
|         zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447) | ||||
| 
 | ||||
|     url = url_map\ | ||||
|         .replace('{latitude}', str(value.get('latitude', 0)))\ | ||||
|         .replace('{longitude}', str(value.get('longitude', 0)))\ | ||||
|         .replace('{latitude}', str(lat))\ | ||||
|         .replace('{longitude}', str(lon))\ | ||||
|         .replace('{zoom}', str(zoom)) | ||||
| 
 | ||||
|     return url | ||||
| 
 | ||||
| 
 | ||||
| def get_wikilink(result, wikiid): | ||||
|     url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None) | ||||
|     if url is None: | ||||
|         return url | ||||
|     elif url.startswith('http://'): | ||||
|     url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid)) | ||||
|     if not url: | ||||
|         return None | ||||
|     url = url[0] | ||||
|     if url.startswith('http://'): | ||||
|         url = url.replace('http://', 'https://') | ||||
|     elif url.startswith('//'): | ||||
|         url = 'https:' + url | ||||
|     return url | ||||
| 
 | ||||
| 
 | ||||
| def get_wiki_firstlanguage(result, wikipatternid): | ||||
|     for k in result.get('sitelinks', {}).keys(): | ||||
|         if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)): | ||||
|             return k[0:2] | ||||
|     return None | ||||
|  | ||||
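get_geolink above receives the coordinate as a scraped display string (for example 55°45'21"N, 37°37'4"E) and converts each half to decimal degrees by slicing around the °, ' and " marks. The same conversion as a standalone helper that is easier to test in isolation (a sketch mirroring the patch's arithmetic, with the hemisphere check simplified to the final character):

def dms_to_decimal(part):
    # u"37°37'4\"E" -> 37.6177...; mirrors the slicing in get_geolink above
    part = part.strip()
    value = int(part[:part.find(u'°')])
    if part.find(u"'") >= 0:
        value += int(part[part.find(u'°') + 1:part.find(u"'")] or 0) / 60.0
    if part.find(u'"') >= 0:
        value += float(part[part.find(u"'") + 1:part.find(u'"')] or 0) / 3600.0
    if part[-1] in ('S', 'W'):  # southern/western hemisphere is negative
        value = -value
    return value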
| @@ -99,9 +99,8 @@ def response(resp): | ||||
|         return [] | ||||
| 
 | ||||
|     # link to wikipedia article | ||||
|     # parenthesis are not quoted to make infobox mergeable with wikidata's | ||||
|     wikipedia_link = url_lang(resp.search_params['language']) \ | ||||
|         + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')') | ||||
|         + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')) | ||||
| 
 | ||||
|     results.append({'url': wikipedia_link, 'title': title}) | ||||
| 
 | ||||
|  | ||||
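The removed comment explained why parentheses had been unquoted here; after this patch the title is percent-quoted uniformly. For reference, what quoting does to a parenthesized article title (Python 3 urllib.parse spelling; the engine itself imports quote from Python 2's urllib):

from urllib.parse import quote

title = 'Python (programming language)'
print(quote(title.replace(' ', '_')))
# -> Python_%28programming_language%29  (parentheses now stay percent-encoded)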
| @@ -18,7 +18,17 @@ def result_content_len(content): | ||||
| 
 | ||||
| 
 | ||||
| def compare_urls(url_a, url_b): | ||||
|     if url_a.netloc != url_b.netloc or url_a.query != url_b.query: | ||||
|     # ignore www. in comparison | ||||
|     if url_a.netloc.startswith('www.'): | ||||
|         host_a = url_a.netloc.replace('www.', '', 1) | ||||
|     else: | ||||
|         host_a = url_a.netloc | ||||
|     if url_b.netloc.startswith('www.'): | ||||
|         host_b = url_b.netloc.replace('www.', '', 1) | ||||
|     else: | ||||
|         host_b = url_b.netloc | ||||
| 
 | ||||
|     if host_a != host_b or url_a.query != url_b.query: | ||||
|         return False | ||||
| 
 | ||||
|     # remove / from the end of the url if required | ||||
| @@ -33,25 +43,42 @@ def compare_urls(url_a, url_b): | ||||
| 
 | ||||
| 
 | ||||
| def merge_two_infoboxes(infobox1, infobox2): | ||||
|     # get engines weights | ||||
|     if hasattr(engines[infobox1['engine']], 'weight'): | ||||
|         weight1 = engines[infobox1['engine']].weight | ||||
|     else: | ||||
|         weight1 = 1 | ||||
|     if hasattr(engines[infobox2['engine']], 'weight'): | ||||
|         weight2 = engines[infobox2['engine']].weight | ||||
|     else: | ||||
|         weight2 = 1 | ||||
| 
 | ||||
|     if weight2 > weight1: | ||||
|         infobox1['engine'] = infobox2['engine'] | ||||
| 
 | ||||
|     if 'urls' in infobox2: | ||||
|         urls1 = infobox1.get('urls', None) | ||||
|         if urls1 is None: | ||||
|             urls1 = [] | ||||
|             infobox1['urls'] = urls1 | ||||
| 
 | ||||
|         urlSet = set() | ||||
|         for url in infobox1.get('urls', []): | ||||
|             urlSet.add(url.get('url', None)) | ||||
|         for url2 in infobox2.get('urls', []): | ||||
|             unique_url = True | ||||
|             for url1 in infobox1.get('urls', []): | ||||
|                 if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))): | ||||
|                     unique_url = False | ||||
|                     break | ||||
|             if unique_url: | ||||
|                 urls1.append(url2) | ||||
| 
 | ||||
|         for url in infobox2.get('urls', []): | ||||
|             if url.get('url', None) not in urlSet: | ||||
|                 urls1.append(url) | ||||
|         infobox1['urls'] = urls1 | ||||
| 
 | ||||
|     if 'img_src' in infobox2: | ||||
|         img1 = infobox1.get('img_src', None) | ||||
|         img2 = infobox2.get('img_src') | ||||
|         if img1 is None: | ||||
|             infobox1['img_src'] = img2 | ||||
|         elif weight2 > weight1: | ||||
|             infobox1['img_src'] = img2 | ||||
| 
 | ||||
|     if 'attributes' in infobox2: | ||||
|         attributes1 = infobox1.get('attributes', None) | ||||
| @@ -65,7 +92,8 @@ def merge_two_infoboxes(infobox1, infobox2): | ||||
|                 attributeSet.add(attribute.get('label', None)) | ||||
| 
 | ||||
|         for attribute in infobox2.get('attributes', []): | ||||
|             attributes1.append(attribute) | ||||
|             if attribute.get('label', None) not in attributeSet: | ||||
|                 attributes1.append(attribute) | ||||
| 
 | ||||
|     if 'content' in infobox2: | ||||
|         content1 = infobox1.get('content', None) | ||||
| @@ -97,7 +125,6 @@ class ResultContainer(object): | ||||
|         self.results = defaultdict(list) | ||||
|         self._merged_results = [] | ||||
|         self.infoboxes = [] | ||||
|         self._infobox_ids = {} | ||||
|         self.suggestions = set() | ||||
|         self.answers = set() | ||||
|         self._number_of_results = [] | ||||
| @@ -138,14 +165,13 @@ class ResultContainer(object): | ||||
|         add_infobox = True | ||||
|         infobox_id = infobox.get('id', None) | ||||
|         if infobox_id is not None: | ||||
|             existingIndex = self._infobox_ids.get(infobox_id, None) | ||||
|             if existingIndex is not None: | ||||
|                 merge_two_infoboxes(self.infoboxes[existingIndex], infobox) | ||||
|                 add_infobox = False | ||||
|             for existingIndex in self.infoboxes: | ||||
|                 if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)): | ||||
|                     merge_two_infoboxes(existingIndex, infobox) | ||||
|                     add_infobox = False | ||||
| 
 | ||||
|         if add_infobox: | ||||
|             self.infoboxes.append(infobox) | ||||
|             self._infobox_ids[infobox_id] = len(self.infoboxes) - 1 | ||||
| 
 | ||||
|     def _merge_result(self, result, position): | ||||
|         result['parsed_url'] = urlparse(result['url']) | ||||
| @@ -155,11 +181,6 @@ class ResultContainer(object): | ||||
|             result['parsed_url'] = result['parsed_url']._replace(scheme="http") | ||||
|             result['url'] = result['parsed_url'].geturl() | ||||
| 
 | ||||
|         result['host'] = result['parsed_url'].netloc | ||||
| 
 | ||||
|         if result['host'].startswith('www.'): | ||||
|             result['host'] = result['host'].replace('www.', '', 1) | ||||
| 
 | ||||
|         result['engines'] = [result['engine']] | ||||
| 
 | ||||
|         # strip multiple spaces and cariage returns from content | ||||
|  | ||||
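Two related changes land in the results container: compare_urls now treats hosts as equal up to a leading www., and infoboxes are matched by comparing their id URLs instead of exact string keys in a dict. A condensed sketch of the comparison (hypothetical helper; same normalization as the code above, which also ignores the scheme and a trailing slash):

from urllib.parse import urlparse

def urls_equal(a, b):
    a, b = urlparse(a), urlparse(b)

    def host(u):
        # ignore a leading 'www.' exactly as compare_urls does
        return u.netloc[4:] if u.netloc.startswith('www.') else u.netloc

    if host(a) != host(b) or a.query != b.query:
        return False
    # tolerate a trailing slash on either path
    return a.path.rstrip('/') == b.path.rstrip('/')

assert urls_equal('http://example.com/page/', 'https://www.example.com/page')
assert not urls_equal('https://example.com/a', 'https://example.com/b')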
| @@ -105,6 +105,7 @@ engines: | ||||
|   - name : ddg definitions | ||||
|     engine : duckduckgo_definitions | ||||
|     shortcut : ddd | ||||
|     weight : 2 | ||||
|     disabled : True | ||||
| 
 | ||||
|   - name : digg | ||||
| @@ -127,6 +128,7 @@ engines: | ||||
|   - name : wikidata | ||||
|     engine : wikidata | ||||
|     shortcut : wd | ||||
|     weight : 2 | ||||
| 
 | ||||
|   - name : duckduckgo | ||||
|     engine : duckduckgo | ||||
|  | ||||
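The new weight: 2 entries feed the merge logic added above: when two engines return the same infobox, the heavier engine's fields (engine name, img_src) win on conflict. A toy illustration of that rule (hypothetical helper; the real logic lives in merge_two_infoboxes):

def pick_img_src(infobox1, infobox2, weight1=1, weight2=1):
    # keep infobox1's image unless it is missing or infobox2 outweighs it
    img2 = infobox2.get('img_src')
    if img2 is not None and (infobox1.get('img_src') is None or weight2 > weight1):
        infobox1['img_src'] = img2

box1 = {'engine': 'wikipedia', 'img_src': 'a.png'}
box2 = {'engine': 'wikidata', 'img_src': 'b.png'}
pick_img_src(box1, box2, weight1=1, weight2=2)  # wikidata carries weight 2 above
assert box1['img_src'] == 'b.png'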
| @@ -1,18 +1,18 @@ | ||||
| <div class="infobox"> | ||||
|     <h2>{{ infobox.infobox }}</h2> | ||||
| <h2><bdi>{{ infobox.infobox }}</bdi></h2> | ||||
|     {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %} | ||||
|     <p>{{ infobox.entity }}</p> | ||||
|     <p>{{ infobox.content | safe }}</p> | ||||
|     <p><bdi>{{ infobox.entity }}</bdi></p> | ||||
|     <p><bdi>{{ infobox.content | safe }}</bdi></p> | ||||
|     {% if infobox.attributes %} | ||||
|     <div class="attributes"> | ||||
|         <table> | ||||
|             {% for attribute in infobox.attributes %} | ||||
|             <tr> | ||||
|                 <td>{{ attribute.label }}</td> | ||||
|                 <td><bdi>{{ attribute.label }}</bdi></td> | ||||
|                 {% if attribute.image %} | ||||
|                 <td><img src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> | ||||
|                 {% else %} | ||||
|                 <td>{{ attribute.value }}</td> | ||||
|                 <td><bdi>{{ attribute.value }}</bdi></td> | ||||
|                 {% endif %} | ||||
|             </tr> | ||||
|             {% endfor %} | ||||
| @@ -24,7 +24,7 @@ | ||||
|     <div class="urls"> | ||||
|         <ul> | ||||
|             {% for url in infobox.urls %} | ||||
|             <li class="url"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></li> | ||||
|             <li class="url"><bdi><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></bdi></li> | ||||
|             {% endfor %} | ||||
|         </ul> | ||||
|     </div> | ||||
| @@ -34,7 +34,7 @@ | ||||
|     <div class="relatedTopics"> | ||||
|         {% for topic in infobox.relatedTopics %} | ||||
|         <div> | ||||
|             <h3>{{ topic.name }}</h3> | ||||
|             <h3><bdi>{{ topic.name }}</bdi></h3> | ||||
|             {% for suggestion in topic.suggestions %} | ||||
|             <form method="{{ method or 'POST' }}" action="{{ url_for('index') }}"> | ||||
|                 <input type="hidden" name="q" value="{{ suggestion }}"> | ||||
|  | ||||
| @@ -1,21 +1,20 @@ | ||||
| <div class="panel panel-default infobox"> | ||||
|     <div class="panel-heading"> | ||||
|         <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi> | ||||
|         <h4 class="panel-title infobox_part"><bdi>{{ infobox.infobox }}</bdi></h4> | ||||
|     </div> | ||||
|     <div class="panel-body"> | ||||
|         <bdi> | ||||
|         {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %} | ||||
|         {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %} | ||||
|         {% if infobox.content %}<p class="infobox_part"><bdi>{{ infobox.content }}</bdi></p>{% endif %} | ||||
| 
 | ||||
|         {% if infobox.attributes %} | ||||
|         <table class="table table-striped infobox_part"> | ||||
|             {% for attribute in infobox.attributes %} | ||||
|             <tr> | ||||
|                 <td>{{ attribute.label }}</td> | ||||
|                 <td><bdi>{{ attribute.label }}</bdi></td> | ||||
|                 {% if attribute.image %} | ||||
|                 <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> | ||||
|                 {% else %} | ||||
|                 <td>{{ attribute.value }}</td> | ||||
|                 <td><bdi>{{ attribute.value }}</bdi></td> | ||||
|                 {% endif %} | ||||
|             </tr> | ||||
|             {% endfor %} | ||||
| @@ -24,11 +23,12 @@ | ||||
| 
 | ||||
|         {% if infobox.urls %} | ||||
|         <div class="infobox_part"> | ||||
|             <bdi> | ||||
|             {% for url in infobox.urls %} | ||||
|             <p class="btn btn-default btn-xs"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></p> | ||||
|             {% endfor %} | ||||
|             </bdi> | ||||
|         </div> | ||||
|         {% endif %} | ||||
|         </bdi> | ||||
|     </div> | ||||
| </div> | ||||
|  | ||||
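Both templates wrap user-visible values in <bdi>, which isolates right-to-left runs (an Arabic or Hebrew label coming back from Wikidata) so they cannot visually reorder the surrounding left-to-right layout. A toy rendering check with Jinja2, the template engine searx uses (illustrative only, not part of the patch):

from jinja2 import Template

# autoescape guards the value; <bdi> keeps the RTL label from bleeding
# into the LTR markup around it
t = Template(u'<td><bdi>{{ label }}</bdi></td>', autoescape=True)
print(t.render(label=u'עברית'))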
| @@ -206,7 +206,13 @@ def format_date_by_locale(date, locale_string): | ||||
|     if locale_string == 'all': | ||||
|         locale_string = settings['ui']['default_locale'] or 'en_US' | ||||
| 
 | ||||
|     return format_date(date, locale=locale_string) | ||||
|     # to avoid crashing if locale is not supported by babel | ||||
|     try: | ||||
|         formatted_date = format_date(date, locale=locale_string) | ||||
|     except: | ||||
|         formatted_date = format_date(date, "YYYY-MM-dd") | ||||
| 
 | ||||
|     return formatted_date | ||||
| 
 | ||||
| 
 | ||||
| def dict_subset(d, properties): | ||||
|  | ||||
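The bare except reflects that babel can fail in more than one way on an unexpected locale string (UnknownLocaleError, ValueError). A usage sketch of the fallback path, assuming babel is installed; 'yua_MX' is borrowed from the new tests as a locale babel is unlikely to support:

from datetime import date
from babel.dates import format_date

try:
    formatted = format_date(date(2016, 1, 26), locale='yua_MX')
except Exception:  # babel raises UnknownLocaleError/ValueError here
    formatted = format_date(date(2016, 1, 26), 'YYYY-MM-dd')  # locale-neutral pattern
print(formatted)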
							
								
								
									
tests/unit/engines/test_wikidata.py (new file, 504 lines)
| @@ -0,0 +1,504 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| from json import loads | ||||
| from lxml.html import fromstring | ||||
| from collections import defaultdict | ||||
| import mock | ||||
| from searx.engines import wikidata | ||||
| from searx.testing import SearxTestCase | ||||
| 
 | ||||
| 
 | ||||
| class TestWikidataEngine(SearxTestCase): | ||||
| 
 | ||||
|     def test_request(self): | ||||
|         query = 'test_query' | ||||
|         dicto = defaultdict(dict) | ||||
|         dicto['language'] = 'all' | ||||
|         params = wikidata.request(query, dicto) | ||||
|         self.assertIn('url', params) | ||||
|         self.assertIn(query, params['url']) | ||||
|         self.assertIn('wikidata.org', params['url']) | ||||
|         self.assertIn('en', params['url']) | ||||
| 
 | ||||
|         dicto['language'] = 'es_ES' | ||||
|         params = wikidata.request(query, dicto) | ||||
|         self.assertIn(query, params['url']) | ||||
|         self.assertIn('es', params['url']) | ||||
| 
 | ||||
|     # successful cases are not tested here to avoid sending additional requests | ||||
|     def test_response(self): | ||||
|         self.assertRaises(AttributeError, wikidata.response, None) | ||||
|         self.assertRaises(AttributeError, wikidata.response, []) | ||||
|         self.assertRaises(AttributeError, wikidata.response, '') | ||||
|         self.assertRaises(AttributeError, wikidata.response, '[]') | ||||
| 
 | ||||
|         response = mock.Mock(content='<html></html>', search_params={"language": "all"}) | ||||
|         self.assertEqual(wikidata.response(response), []) | ||||
| 
 | ||||
|     def test_getDetail(self): | ||||
|         response = {} | ||||
|         results = wikidata.getDetail(response, "Q123", "en", "en-US") | ||||
|         self.assertEqual(results, []) | ||||
| 
 | ||||
|         title_html = '<div><div class="wikibase-title-label">Test</div></div>' | ||||
|         html = """ | ||||
|         <div> | ||||
|             <div class="wikibase-entitytermsview-heading-description"> | ||||
|             </div> | ||||
|             <div> | ||||
|                 <ul class="wikibase-sitelinklistview-listview"> | ||||
|                     <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> | ||||
|                 </ul> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         response = {"parse": {"displaytitle": title_html, "text": html}} | ||||
| 
 | ||||
|         results = wikidata.getDetail(response, "Q123", "en", "en-US") | ||||
|         self.assertEqual(len(results), 1) | ||||
|         self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test') | ||||
| 
 | ||||
|         title_html = """ | ||||
|         <div> | ||||
|             <div class="wikibase-title-label"> | ||||
|                 <span lang="en">Test</span> | ||||
|                 <sup class="wb-language-fallback-indicator">English</sup> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html = """ | ||||
|         <div> | ||||
|             <div class="wikibase-entitytermsview-heading-description"> | ||||
|                 <span lang="en">Description</span> | ||||
|                 <sup class="wb-language-fallback-indicator">English</sup> | ||||
|             </div> | ||||
|             <div id="P856"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P856"> | ||||
|                         <span lang="en">official website</span> | ||||
|                         <sup class="wb-language-fallback-indicator">English</sup> | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementview-mainsnak"> | ||||
|                     <a class="external free" href="https://officialsite.com"> | ||||
|                         https://officialsite.com | ||||
|                     </a> | ||||
|                 </div> | ||||
|             </div> | ||||
|             <div> | ||||
|                 <ul class="wikibase-sitelinklistview-listview"> | ||||
|                     <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> | ||||
|                 </ul> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         response = {"parse": {"displaytitle": title_html, "text": html}} | ||||
| 
 | ||||
|         results = wikidata.getDetail(response, "Q123", "yua", "yua_MX") | ||||
|         self.assertEqual(len(results), 2) | ||||
|         self.assertEqual(results[0]['title'], 'Official website') | ||||
|         self.assertEqual(results[0]['url'], 'https://officialsite.com') | ||||
| 
 | ||||
|         self.assertEqual(results[1]['infobox'], 'Test') | ||||
|         self.assertEqual(results[1]['id'], None) | ||||
|         self.assertEqual(results[1]['content'], 'Description') | ||||
|         self.assertEqual(results[1]['attributes'], []) | ||||
|         self.assertEqual(results[1]['urls'][0]['title'], 'Official website') | ||||
|         self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com') | ||||
|         self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)') | ||||
|         self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test') | ||||
| 
 | ||||
|     def test_add_image(self): | ||||
|         image_src = wikidata.add_image(fromstring("<div></div>")) | ||||
|         self.assertEqual(image_src, None) | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P18"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P18"> | ||||
|                         image | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="https://commons.wikimedia.org/wiki/File:image.png"> | ||||
|                                         image.png | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
| 
 | ||||
|         image_src = wikidata.add_image(html_etree) | ||||
|         self.assertEqual(image_src, | ||||
|                          "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400") | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P2910"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P2910"> | ||||
|                         icon | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="https://commons.wikimedia.org/wiki/File:icon.png"> | ||||
|                                         icon.png | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|             <div id="P154"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P154"> | ||||
|                         logo | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="https://commons.wikimedia.org/wiki/File:logo.png"> | ||||
|                                         logo.png | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
| 
 | ||||
|         image_src = wikidata.add_image(html_etree) | ||||
|         self.assertEqual(image_src, | ||||
|                          "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400") | ||||
| 
 | ||||
|     def test_add_attribute(self): | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P27"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P27"> | ||||
|                         country of citizenship | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="/wiki/Q145"> | ||||
|                                         United Kingdom | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         attributes = [] | ||||
|         html_etree = fromstring(html) | ||||
| 
 | ||||
|         wikidata.add_attribute(attributes, html_etree, "Fail") | ||||
|         self.assertEqual(attributes, []) | ||||
| 
 | ||||
|         wikidata.add_attribute(attributes, html_etree, "P27") | ||||
|         self.assertEqual(len(attributes), 1) | ||||
|         self.assertEqual(attributes[0]["label"], "Country of citizenship") | ||||
|         self.assertEqual(attributes[0]["value"], "United Kingdom") | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P569"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P569"> | ||||
|                         date of birth | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     27 January 1832 | ||||
|                                     <sup class="wb-calendar-name"> | ||||
|                                         Gregorian | ||||
|                                     </sup> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         attributes = [] | ||||
|         html_etree = fromstring(html) | ||||
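|         # with date=True the value is still a plain date string; the | ||||
|         # <sup class="wb-calendar-name"> marker is not part of the value | ||||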
|         wikidata.add_attribute(attributes, html_etree, "P569", date=True) | ||||
|         self.assertEqual(len(attributes), 1) | ||||
|         self.assertEqual(attributes[0]["label"], "Date of birth") | ||||
|         self.assertEqual(attributes[0]["value"], "27 January 1832") | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P6"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P27"> | ||||
|                         head of government | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-normal"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="/wiki/Q206"> | ||||
|                                         Old Prime Minister | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-rankselector"> | ||||
|                             <span class="wikibase-rankselector-preferred"></span> | ||||
|                         </div> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a href="/wiki/Q3099714"> | ||||
|                                         Actual Prime Minister | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         attributes = [] | ||||
|         html_etree = fromstring(html) | ||||
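|         # P6 carries two statements, one normal-rank and one preferred; | ||||
|         # without trim every value is joined with ", ", while trim=True | ||||
|         # (checked below) keeps only the preferred statement | ||||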
|         wikidata.add_attribute(attributes, html_etree, "P6") | ||||
|         self.assertEqual(len(attributes), 1) | ||||
|         self.assertEqual(attributes[0]["label"], "Head of government") | ||||
|         self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister") | ||||
| 
 | ||||
|         attributes = [] | ||||
|         html_etree = fromstring(html) | ||||
|         wikidata.add_attribute(attributes, html_etree, "P6", trim=True) | ||||
|         self.assertEqual(len(attributes), 1) | ||||
|         self.assertEqual(attributes[0]["value"], "Actual Prime Minister") | ||||
| 
 | ||||
|     def test_add_url(self): | ||||
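|         # add_url() collects {title, url} dicts; the title defaults to the | ||||
|         # capitalized property label, can be overridden with a custom label, | ||||
|         # and entries are mirrored into the optional results list. | ||||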
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P856"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P856"> | ||||
|                         official website | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a class="external free" href="https://searx.me"> | ||||
|                                         https://searx.me/ | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         urls = [] | ||||
|         html_etree = fromstring(html) | ||||
|         wikidata.add_url(urls, html_etree, 'P856') | ||||
|         self.assertEqual(len(urls), 1) | ||||
|         self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls) | ||||
|         urls = [] | ||||
|         results = [] | ||||
|         wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results) | ||||
|         self.assertEqual(len(urls), 1) | ||||
|         self.assertEqual(len(results), 1) | ||||
|         self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls) | ||||
|         self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results) | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div id="P856"> | ||||
|                 <div class="wikibase-statementgroupview-property-label"> | ||||
|                     <a href="/wiki/Property:P856"> | ||||
|                         official website | ||||
|                     </a> | ||||
|                 </div> | ||||
|                 <div class="wikibase-statementlistview"> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a class="external free" href="http://www.worldofwarcraft.com"> | ||||
|                                         http://www.worldofwarcraft.com | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                     <div class="wikibase-statementview listview-item"> | ||||
|                         <div class="wikibase-statementview-mainsnak"> | ||||
|                             <div> | ||||
|                                 <div class="wikibase-snakview-value"> | ||||
|                                     <a class="external free" href="http://eu.battle.net/wow/en/"> | ||||
|                                         http://eu.battle.net/wow/en/ | ||||
|                                     </a> | ||||
|                                 </div> | ||||
|                             </div> | ||||
|                         </div> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         urls = [] | ||||
|         html_etree = fromstring(html) | ||||
|         wikidata.add_url(urls, html_etree, 'P856') | ||||
|         self.assertEqual(len(urls), 2) | ||||
|         self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls) | ||||
|         self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls) | ||||
| 
 | ||||
|     def test_get_imdblink(self): | ||||
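|         # get_imdblink() reads the external-id text of the snak value and | ||||
|         # builds an IMDb URL under the given prefix | ||||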
|         html = u""" | ||||
|         <div> | ||||
|             <div class="wikibase-statementview-mainsnak"> | ||||
|                 <div> | ||||
|                     <div class="wikibase-snakview-value"> | ||||
|                         <a class="wb-external-id" href="http://www.imdb.com/tt0433664"> | ||||
|                             tt0433664 | ||||
|                         </a> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
|         imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/') | ||||
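|         # assumption: 'tt' ids map to /title/ URLs, mirroring the | ||||
|         # nm -> /name/ case checked below | ||||
|         self.assertIn('https://www.imdb.com/title/tt0433664', imdblink) | ||||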
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div class="wikibase-statementview-mainsnak"> | ||||
|                 <div> | ||||
|                     <div class="wikibase-snakview-value"> | ||||
|                         <a class="wb-external-id" | ||||
|                            href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994""> | ||||
|                             nm4915994 | ||||
|                         </a> | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
|         imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/') | ||||
|         self.assertIn('https://www.imdb.com/name/nm4915994', imdblink) | ||||
| 
 | ||||
|     def test_get_geolink(self): | ||||
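|         # get_geolink() converts the displayed coordinates into an | ||||
|         # openstreetmap.org link; southern/western DMS coordinates become | ||||
|         # negative decimal degrees (second case below) | ||||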
|         html = u""" | ||||
|         <div> | ||||
|             <div class="wikibase-statementview-mainsnak"> | ||||
|                 <div> | ||||
|                     <div class="wikibase-snakview-value"> | ||||
|                         60°N, 40°E | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
|         geolink = wikidata.get_geolink(html_etree) | ||||
|         self.assertIn('https://www.openstreetmap.org/', geolink) | ||||
|         self.assertIn('lat=60&lon=40', geolink) | ||||
| 
 | ||||
|         html = u""" | ||||
|         <div> | ||||
|             <div class="wikibase-statementview-mainsnak"> | ||||
|                 <div> | ||||
|                     <div class="wikibase-snakview-value"> | ||||
|                         34°35'59"S, 58°22'55"W | ||||
|                     </div> | ||||
|                 </div> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
|         geolink = wikidata.get_geolink(html_etree) | ||||
|         self.assertIn('https://www.openstreetmap.org/', geolink) | ||||
|         self.assertIn('lat=-34.59', geolink) | ||||
|         self.assertIn('lon=-58.38', geolink) | ||||
| 
 | ||||
|     def test_get_wikilink(self): | ||||
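|         # get_wikilink() resolves the sitelink whose data-wb-siteid matches, | ||||
|         # normalizing plain-http links to https; unknown site ids yield None | ||||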
|         html = """ | ||||
|         <div> | ||||
|             <div> | ||||
|                 <ul class="wikibase-sitelinklistview-listview"> | ||||
|                     <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li> | ||||
|                     <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li> | ||||
|                 </ul> | ||||
|             </div> | ||||
|             <div> | ||||
|                 <ul class="wikibase-sitelinklistview-listview"> | ||||
|                     <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li> | ||||
|                 </ul> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         html_etree = fromstring(html) | ||||
|         wikilink = wikidata.get_wikilink(html_etree, 'nowiki') | ||||
|         self.assertEqual(wikilink, None) | ||||
|         wikilink = wikidata.get_wikilink(html_etree, 'enwiki') | ||||
|         self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test') | ||||
|         wikilink = wikidata.get_wikilink(html_etree, 'arwiki') | ||||
|         self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test') | ||||
|         wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote') | ||||
|         self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test') | ||||