mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	[mod] implement searx.wikidata_units for unit converters
This commit is contained in:
		
							parent
							
								
									cf59ee2efc
								
							
						
					
					
						commit
						a800dd0473
					
				| @ -15,7 +15,7 @@ import babel.numbers | ||||
| 
 | ||||
| from flask_babel import gettext, get_locale | ||||
| 
 | ||||
| from searx import data | ||||
| from searx.units import symbol_to_si | ||||
| from searx.plugins import Plugin, PluginInfo | ||||
| from searx.result_types import EngineResults | ||||
| 
 | ||||
| @ -86,132 +86,6 @@ RE_MEASURE = r''' | ||||
| ''' | ||||
| 
 | ||||
| 
 | ||||
| ADDITIONAL_UNITS = [ | ||||
|     { | ||||
|         "si_name": "Q11579", | ||||
|         "symbol": "°C", | ||||
|         "to_si": lambda val: val + 273.15, | ||||
|         "from_si": lambda val: val - 273.15, | ||||
|     }, | ||||
|     { | ||||
|         "si_name": "Q11579", | ||||
|         "symbol": "°F", | ||||
|         "to_si": lambda val: (val + 459.67) * 5 / 9, | ||||
|         "from_si": lambda val: (val * 9 / 5) - 459.67, | ||||
|     }, | ||||
| ] | ||||
| """Additional items to convert from a measure unit to a SI unit (vice versa). | ||||
| 
 | ||||
| .. code:: python | ||||
| 
 | ||||
|     { | ||||
|         "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin) | ||||
|         "symbol": "°C",                      # symbol of the measure unit | ||||
|         "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit | ||||
|         "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit | ||||
|     }, | ||||
|     { | ||||
|         "si_name": "Q11573", | ||||
|         "symbol": "mi", | ||||
|         "to_si": 1609.344,                   # convert measure value (val) to SI unit | ||||
|         "from_si": 1 / 1609.344              # convert SI value (val) measure unit | ||||
|     }, | ||||
| 
 | ||||
| The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier) | ||||
| or a callable_ (val in / converted value returned). | ||||
| 
 | ||||
| .. _callable: https://docs.python.org/3/glossary.html#term-callable | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| ALIAS_SYMBOLS = { | ||||
|     '°C': ('C',), | ||||
|     '°F': ('F',), | ||||
|     'mi': ('L',), | ||||
| } | ||||
| """Alias symbols for known unit of measure symbols / by example:: | ||||
| 
 | ||||
|     '°C': ('C', ...),  # list of alias symbols for °C (Q69362731) | ||||
|     '°F': ('F', ...),  # list of alias symbols for °F (Q99490479) | ||||
|     'mi': ('L',),      # list of alias symbols for mi (Q253276) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| SYMBOL_TO_SI = [] | ||||
| 
 | ||||
| 
 | ||||
def symbol_to_si():
    """Build (once) and return the list of known measure units.

    Each entry of the list is a tuple with the fields:

    0. symbol of the measure unit (e.g. ``'mi'``)
    1. Wikidata item ID of the SI unit (e.g. ``Q11573`` for 'metre')
    2. ``from_si``: multiplier or callable to get the measure value from the
       SI value
    3. ``to_si``: multiplier or callable to get the SI value from the measure
       value
    4. symbol of the origin unit (differs from field 0 only for aliases)

    The entries are ordered by their source: units from ``WIKIDATA_UNITS``
    come first, then the :py:obj:`ADDITIONAL_UNITS` and finally the entries
    derived from :py:obj:`ALIAS_SYMBOLS`.  A linear search for a symbol
    therefore prefers Wikidata over additional units over aliases.
    """

    global SYMBOL_TO_SI  # pylint: disable=global-statement
    if SYMBOL_TO_SI:
        # already generated by a previous call
        return SYMBOL_TO_SI

    # Units without a SI factor can't be normalized and units without a
    # symbol can't be looked up (arcsecond does not have a symbol, see
    # https://www.wikidata.org/wiki/Q829073) / both are skipped.
    SYMBOL_TO_SI.extend(
        (
            unit['symbol'],
            unit['si_name'],
            1 / unit['to_si_factor'],  # from_si
            unit['to_si_factor'],  # to_si
            unit['symbol'],
        )
        for unit in data.WIKIDATA_UNITS.values()
        if unit['to_si_factor'] and unit['symbol']
    )

    SYMBOL_TO_SI.extend(
        (
            unit['symbol'],
            unit['si_name'],
            unit['from_si'],
            unit['to_si'],
            unit['symbol'],
        )
        for unit in ADDITIONAL_UNITS
    )

    # an alias shares the SI name and the conversions of its origin unit
    aliases = [
        (alias, entry[1], entry[2], entry[3], entry[0])
        for entry in SYMBOL_TO_SI
        for alias in ALIAS_SYMBOLS.get(entry[0], ())
    ]
    SYMBOL_TO_SI = SYMBOL_TO_SI + aliases
    return SYMBOL_TO_SI
| 
 | ||||
| 
 | ||||
| def _parse_text_and_convert(from_query, to_query) -> str | None: | ||||
| 
 | ||||
|     # pylint: disable=too-many-branches, too-many-locals | ||||
|  | ||||
							
								
								
									
										231
									
								
								searx/wikidata_units.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										231
									
								
								searx/wikidata_units.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,231 @@ | ||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| """Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and | ||||
| Coordinates`_ | ||||
| 
 | ||||
| .. _SPARQL/WIKIDATA Precision, Units and Coordinates: | ||||
|    https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities | ||||
| """ | ||||
| 
 | ||||
| __all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"] | ||||
| 
 | ||||
| import collections | ||||
| 
 | ||||
| from searx import data | ||||
| from searx.engines import wikidata | ||||
| 
 | ||||
| ADDITIONAL_UNITS = [ | ||||
|     { | ||||
|         "si_name": "Q11579", | ||||
|         "symbol": "°C", | ||||
|         "to_si": lambda val: val + 273.15, | ||||
|         "from_si": lambda val: val - 273.15, | ||||
|     }, | ||||
|     { | ||||
|         "si_name": "Q11579", | ||||
|         "symbol": "°F", | ||||
|         "to_si": lambda val: (val + 459.67) * 5 / 9, | ||||
|         "from_si": lambda val: (val * 9 / 5) - 459.67, | ||||
|     }, | ||||
| ] | ||||
| """Additional items to convert from a measure unit to a SI unit (vice versa). | ||||
| 
 | ||||
| .. code:: python | ||||
| 
 | ||||
|     { | ||||
|         "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin) | ||||
|         "symbol": "°C",                      # symbol of the measure unit | ||||
|         "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit | ||||
|         "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit | ||||
|     }, | ||||
|     { | ||||
|         "si_name": "Q11573", | ||||
|         "symbol": "mi", | ||||
|         "to_si": 1609.344,                   # convert measure value (val) to SI unit | ||||
|         "from_si": 1 / 1609.344              # convert SI value (val) measure unit | ||||
|     }, | ||||
| 
 | ||||
| The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier) | ||||
| or a callable_ (val in / converted value returned). | ||||
| 
 | ||||
| .. _callable: https://docs.python.org/3/glossary.html#term-callable | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| ALIAS_SYMBOLS = { | ||||
|     '°C': ('C',), | ||||
|     '°F': ('F',), | ||||
|     'mi': ('L',), | ||||
| } | ||||
| """Alias symbols for known unit of measure symbols / by example:: | ||||
| 
 | ||||
|     '°C': ('C', ...),  # list of alias symbols for °C (Q69362731) | ||||
|     '°F': ('F', ...),  # list of alias symbols for °F (Q99490479) | ||||
|     'mi': ('L',),      # list of alias symbols for mi (Q253276) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| SYMBOL_TO_SI = [] | ||||
| UNITS_BY_SI_NAME: dict | None = None | ||||
| 
 | ||||
| 
 | ||||
def _convert(conversion, value: float | int) -> float:
    """Apply ``conversion`` to ``value``: a number is used as multiplier, any
    other object is called with the value."""
    if isinstance(conversion, (float, int)):
        return float(value) * conversion
    return conversion(float(value))


def convert_from_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value``, given in the SI unit ``si_name`` (Wikidata item ID),
    to the measure unit with the symbol ``symbol``.

    :raises KeyError: if ``si_name`` is unknown or has no unit ``symbol``.
    """
    # field 2 of a symbol_to_si() tuple is the ``from_si`` conversion
    return _convert(units_by_si_name(si_name)[symbol][2], value)


def convert_to_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value``, given in the measure unit with the symbol
    ``symbol``, to the SI unit ``si_name`` (Wikidata item ID).

    :raises KeyError: if ``si_name`` is unknown or has no unit ``symbol``.
    """
    # field 3 of a symbol_to_si() tuple is the ``to_si`` conversion
    return _convert(units_by_si_name(si_name)[symbol][3], value)


def units_by_si_name(si_name):
    """Return the units which are normalized to the SI unit ``si_name`` as a
    dictionary that maps the symbol of a unit to its tuple from
    :py:obj:`symbol_to_si`.

    The index over all SI units is built on the first call and cached in
    ``UNITS_BY_SI_NAME``.

    :raises KeyError: if there is no unit for ``si_name``.
    """

    global UNITS_BY_SI_NAME  # pylint: disable=global-statement
    if UNITS_BY_SI_NAME is None:
        # Build the index in a local dict and publish it when it is complete,
        # a failure while building must not leave a partial cache behind.
        index = {}
        for item in symbol_to_si():
            # item[1] is the SI name and item[0] the symbol of the unit.  The
            # first entry of a symbol wins to keep the weighting given by the
            # order of the symbol_to_si() list.
            index.setdefault(item[1], {}).setdefault(item[0], item)
        UNITS_BY_SI_NAME = index
    return UNITS_BY_SI_NAME[si_name]
| 
 | ||||
| 
 | ||||
def symbol_to_si():
    """Generates a list of tuples, each tuple is a measure unit and the fields
    in the tuple are:

    0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)

    1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')

    2. ``from_si``: factor to get the measure value from the SI value (e.g.
       SI 1609.344m multiplied by 1 / 1609.344 is equal to 1mi)

    3. ``to_si``: factor to get the SI value from the measure value (e.g. 1mi
       multiplied by 1609.344 is equal to SI 1609.344m)

    4. Symbol of the origin unit / identical to field 0 except for items
       created from :py:obj:`ALIAS_SYMBOLS`, where it is the symbol the alias
       stands for.

    For items from :py:obj:`ADDITIONAL_UNITS` the fields 2 and 3 hold whatever
    is configured there (a multiplier or a callable).

    The returned list is ordered, the first items are created from
    ``WIKIDATA_UNITS``, the second group of items is build from
    :py:obj:`ADDITIONAL_UNITS` and the last group are the items created from
    :py:obj:`ALIAS_SYMBOLS`.

    If you search this list for a symbol, then a match with a symbol from
    Wikidata has the highest weighting (first hit in the list), followed by the
    symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
    given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.

    The list is generated on the first call and cached in ``SYMBOL_TO_SI``.
    """

    global SYMBOL_TO_SI  # pylint: disable=global-statement
    if SYMBOL_TO_SI:
        return SYMBOL_TO_SI

    # The list is assembled in a local variable and published at the end: if
    # an item raises an exception, no partially filled cache is left behind.
    units = []

    # filter out units which can't be normalized to a SI unit and filter out
    # units without a symbol / arcsecond does not have a symbol
    # https://www.wikidata.org/wiki/Q829073

    for item in data.WIKIDATA_UNITS.values():
        if item['to_si_factor'] and item['symbol']:
            units.append(
                (
                    item['symbol'],
                    item['si_name'],
                    1 / item['to_si_factor'],  # from_si
                    item['to_si_factor'],  # to_si
                    item['symbol'],
                )
            )

    for item in ADDITIONAL_UNITS:
        units.append(
            (
                item['symbol'],
                item['si_name'],
                item['from_si'],
                item['to_si'],
                item['symbol'],
            )
        )

    alias_items = []
    for item in units:
        for alias in ALIAS_SYMBOLS.get(item[0], ()):
            alias_items.append(
                (
                    alias,
                    item[1],
                    item[2],  # from_si
                    item[3],  # to_si
                    item[0],  # origin unit
                )
            )

    SYMBOL_TO_SI = units + alias_items
    return SYMBOL_TO_SI
| 
 | ||||
| 
 | ||||
| # the response contains duplicate ?item with the different ?symbol | ||||
| # "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result | ||||
| # even if a ?item has different ?symbol of the same rank. | ||||
| # A deterministic result | ||||
| # see: | ||||
| # * https://www.wikidata.org/wiki/Help:Ranking | ||||
| # * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section) | ||||
| # * https://w.wiki/32BT | ||||
| # * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities | ||||
| #   see the result for https://www.wikidata.org/wiki/Q11582 | ||||
| #   there are multiple symbols with the same rank | ||||
| 
 | ||||
| SARQL_REQUEST = """ | ||||
| SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit | ||||
| WHERE | ||||
| { | ||||
|   ?item wdt:P31/wdt:P279 wd:Q47574 . | ||||
|   ?item p:P5061 ?symbolP . | ||||
|   ?symbolP ps:P5061 ?symbol ; | ||||
|            wikibase:rank ?rank . | ||||
|   OPTIONAL { | ||||
|     ?item p:P2370 ?tosistmt . | ||||
|     ?tosistmt psv:P2370 ?tosinode . | ||||
|     ?tosinode wikibase:quantityAmount ?tosi . | ||||
|     ?tosinode wikibase:quantityUnit ?tosiUnit . | ||||
|   } | ||||
|   FILTER(LANG(?symbol) = "en"). | ||||
| } | ||||
| ORDER BY ?item DESC(?rank) ?symbol | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
def fetch_units():
    """Fetch units from Wikidata.  Function is used to update persistence of
    :py:obj:`searx.data.WIKIDATA_UNITS`.

    Sends :py:obj:`SARQL_REQUEST` and returns an ordered dictionary that maps
    the ID of a Wikidata item to a dictionary with the keys ``symbol``,
    ``si_name`` and ``to_si_factor``.  The last two are ``None`` when the
    response has no SI conversion for the item.
    """

    response = wikidata.send_wikidata_query(SARQL_REQUEST)
    units = collections.OrderedDict()

    for binding in response['results']['bindings']:

        symbol = binding['symbol']['value']
        # the values are entity URIs, the item ID is the last path segment
        item_id = binding['item']['value'].rsplit('/', 1)[1]

        si_uri = binding.get('tosiUnit', {}).get('value', '')
        si_name = si_uri.rsplit('/', 1)[1] if si_uri else None

        factor = binding.get('tosi', {}).get('value', '')

        if item_id in units:
            # ignore duplicate: always use the first one
            continue

        units[item_id] = {
            'symbol': symbol,
            'si_name': si_name or None,
            'to_si_factor': float(factor) if factor else None,
        }
    return units
| @ -8,76 +8,15 @@ Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data | ||||
| """ | ||||
| 
 | ||||
| import json | ||||
| import collections | ||||
| 
 | ||||
| # set path | ||||
| from os.path import join | ||||
| 
 | ||||
| from searx import searx_dir | ||||
| from searx.engines import wikidata, set_loggers | ||||
| from searx.data import data_dir | ||||
| from searx.wikidata_units import fetch_units | ||||
| 
 | ||||
| DATA_FILE = data_dir / 'wikidata_units.json' | ||||
| 
 | ||||
| set_loggers(wikidata, 'wikidata') | ||||
| 
 | ||||
| # the response contains duplicate ?item with the different ?symbol | ||||
| # "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result | ||||
| # even if a ?item has different ?symbol of the same rank. | ||||
| # A deterministic result | ||||
| # see: | ||||
| # * https://www.wikidata.org/wiki/Help:Ranking | ||||
| # * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section) | ||||
| # * https://w.wiki/32BT | ||||
| # * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities | ||||
| #   see the result for https://www.wikidata.org/wiki/Q11582 | ||||
| #   there are multiple symbols with the same rank | ||||
| SARQL_REQUEST = """ | ||||
| SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit | ||||
| WHERE | ||||
| { | ||||
|   ?item wdt:P31/wdt:P279 wd:Q47574 . | ||||
|   ?item p:P5061 ?symbolP . | ||||
|   ?symbolP ps:P5061 ?symbol ; | ||||
|            wikibase:rank ?rank . | ||||
|   OPTIONAL { | ||||
|     ?item p:P2370 ?tosistmt . | ||||
|     ?tosistmt psv:P2370 ?tosinode . | ||||
|     ?tosinode wikibase:quantityAmount ?tosi . | ||||
|     ?tosinode wikibase:quantityUnit ?tosiUnit . | ||||
|   } | ||||
|   FILTER(LANG(?symbol) = "en"). | ||||
| } | ||||
| ORDER BY ?item DESC(?rank) ?symbol | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
def get_data():
    """Send :py:obj:`SARQL_REQUEST` to Wikidata and return the units.

    The result is an ordered dictionary, the key is the ID of the Wikidata
    item and the value a dictionary with the keys ``symbol``, ``si_name`` and
    ``to_si_factor`` (``None`` if the response has no value for it).
    """
    rows = wikidata.send_wikidata_query(SARQL_REQUEST)['results']['bindings']
    results = collections.OrderedDict()

    for row in rows:
        unit_symbol = row['symbol']['value']
        unit_id = row['item']['value'].rsplit('/', 1)[1]

        si_unit = row.get('tosiUnit', {}).get('value', '')
        if si_unit:
            # reduce the entity URI to the item ID
            si_unit = si_unit.rsplit('/', 1)[1]

        si_factor = row.get('tosi', {}).get('value', '')

        # the first row of an item wins, further rows of the item are ignored
        if unit_id in results:
            continue

        entry = {'symbol': unit_symbol}
        entry['si_name'] = si_unit if si_unit else None
        entry['to_si_factor'] = float(si_factor) if si_factor else None
        results[unit_id] = entry

    return results
| 
 | ||||
| 
 | ||||
def get_wikidata_units_filename():
    """Return the path of the ``wikidata_units.json`` file in the ``data``
    folder of the searx package."""
    # An empty last component would make join() return the data folder (with
    # a trailing separator) instead of the path of the file.
    return join(searx_dir, "data", "wikidata_units.json")
| 
 | ||||
| 
 | ||||
if __name__ == '__main__':
    # write the units fetched from Wikidata to the data file as one single
    # JSON document
    with DATA_FILE.open('w', encoding="utf8") as f:
        json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user