searxng/searx/utils.py
Brock Vojkovic 68ff08f224
[mod] drop fasttext-predict (#5795)
Removes the `fasttext-predict` dependency and the language detection code.

If a user selects `auto` for the search language, the detected language now
falls back directly to the `Accept-Language` header sent by the browser (which was already the fallback when fasttext returned no result).

- fasttext's [language detection is unreliable](https://github.com/searxng/searxng/issues/4195) for some languages, especially for short search queries, and in particular for queries containing proper names, which is a common case.
- `fasttext-predict` consumes [significant memory](https://github.com/searxng/searxng/pull/1969#issuecomment-1345366676) without offering users much real value.
- the upstream fasttext project was archived by Meta in 2024.
- users already have two better alternatives: the `Accept-Language` header and the search-syntax language prefix (e.g. `:fr` or `:de`).

Related: https://github.com/searxng/searxng/issues/4195
Closes: https://github.com/searxng/searxng/issues/5790
2026-03-06 15:40:44 +01:00


# SPDX-License-Identifier: AGPL-3.0-or-later
"""Utility functions for the engines"""
import re
import importlib
import importlib.util
import json
import types
import typing as t
from collections.abc import MutableMapping, Callable
from numbers import Number
from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from html import escape
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from datetime import timedelta
from markdown_it import MarkdownIt
from lxml import html
from lxml.etree import XPath, XPathError, XPathSyntaxError
from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage]
from searx import settings
from searx.data import USER_AGENTS, gsa_useragents_loader
from searx.version import VERSION_TAG
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger
logger = logger.getChild('utils')
XPathSpecType: t.TypeAlias = str | XPath
"""Type alias used by :py:obj:`searx.utils.get_xpath`,
:py:obj:`searx.utils.eval_xpath` and other XPath selectors."""
ElementType: t.TypeAlias = ElementBase | _Element
_BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)')
_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
_XPATH_CACHE: dict[str, XPath] = {}
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
Replace the None value, allow explicitly pass None as a function argument"""
_NOTSET = _NotSetClass()
def searxng_useragent() -> str:
"""Return the SearXNG User Agent"""
return f"SearXNG/{VERSION_TAG} {settings['outgoing']['useragent_suffix']}".strip()
def gen_useragent(os_string: str | None = None) -> str:
"""Return a random browser User Agent
See searx/data/useragents.json
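    Example (illustrative only; the concrete UA template and version numbers
    come from useragents.json, so the output varies):
        >>> gen_useragent('X11; Linux x86_64')  # doctest: +SKIP
        'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0'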
"""
return USER_AGENTS['ua'].format(
os=os_string or choice(USER_AGENTS['os']),
version=choice(USER_AGENTS['versions']),
)
def gen_gsa_useragent() -> str:
"""Return a random GSA User Agent suitable for Google
See searx/data/gsa_useragents.txt
"""
return choice(gsa_useragents_loader())
class HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML"""
def __init__(self):
HTMLParser.__init__(self)
self.result: list[str] = []
self.tags: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
self.tags.append(tag)
if tag == 'br':
self.result.append(' ')
def handle_endtag(self, tag: str) -> None:
if not self.tags:
return
if tag != self.tags[-1]:
self.result.append(f"</{tag}>")
return
self.tags.pop()
def is_valid_tag(self):
return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
def handle_data(self, data: str) -> None:
if not self.is_valid_tag():
return
self.result.append(data)
def handle_charref(self, name: str) -> None:
if not self.is_valid_tag():
return
if name[0] in ('x', 'X'):
codepoint = int(name[1:], 16)
else:
codepoint = int(name)
self.result.append(chr(codepoint))
def handle_entityref(self, name: str) -> None:
if not self.is_valid_tag():
return
# codepoint = htmlentitydefs.name2codepoint[name]
# self.result.append(chr(codepoint))
self.result.append(name)
def get_text(self):
return ''.join(self.result).strip()
def error(self, message: str) -> None:
        # error handler is needed in <py3.10
# https://github.com/python/cpython/pull/8562/files
raise AssertionError(message)
def html_to_text(html_str: str) -> str:
"""Extract text from a HTML string
Args:
* html_str (str): string HTML
Returns:
* str: extracted text
Examples:
>>> html_to_text('Example <span id="42">#2</span>')
'Example #2'
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example'
>>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
'regexp: (?<![a-zA-Z]'
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
'Lorem ipsum </i>dolor sit amet</p>'
>>> html_to_text(r'&#x3e &#x3c &#97')
'> < a'
"""
if not html_str:
return ""
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
s = HTMLTextExtractor()
try:
s.feed(html_str)
s.close()
except AssertionError:
s = HTMLTextExtractor()
s.feed(escape(html_str, quote=True))
s.close()
return s.get_text()
def markdown_to_text(markdown_str: str) -> str:
"""Extract text from a Markdown string
Args:
* markdown_str (str): string Markdown
Returns:
* str: extracted text
Examples:
>>> markdown_to_text('[example](https://example.com)')
'example'
>>> markdown_to_text('## Headline')
'Headline'
"""
html_str: str = (
MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
)
return html_to_text(html_str)
def extract_text(
xpath_results: list[ElementType] | ElementType | str | Number | bool | None,
allow_none: bool = False,
) -> str | None:
"""Extract text from a lxml result
- If ``xpath_results`` is a list of :py:obj:`ElementType` objects, extract
the text from each result and concatenate the list in a string.
    - If ``xpath_results`` is a :py:obj:`ElementType` object, extract all the
      text nodes from it ( :py:obj:`lxml.html.tostring`, ``method="text"`` )
- If ``xpath_results`` is of type :py:obj:`str` or :py:obj:`Number`,
:py:obj:`bool` the string value is returned.
    - If ``xpath_results`` is ``None``, a :py:obj:`ValueError` is raised,
      unless ``allow_none`` is ``True``, in which case ``None`` is returned.
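    Examples (a minimal sketch of each case):
        >>> from lxml import html
        >>> dom = html.fromstring('<p>Hello <b>world</b>!</p>')
        >>> extract_text(dom)
        'Hello world!'
        >>> extract_text(dom.xpath('//b'))
        'world'
        >>> extract_text(None, allow_none=True) is None
        True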
"""
if isinstance(xpath_results, list):
        # it's a list of results: concat everything using recursive calls
result = ''
for e in xpath_results:
result = result + (extract_text(e) or '')
return result.strip()
if isinstance(xpath_results, ElementType):
        # it's an element
text: str = html.tostring( # type: ignore
xpath_results, # pyright: ignore[reportArgumentType]
encoding='unicode',
method='text',
with_tail=False,
)
text = text.strip().replace('\n', ' ') # type: ignore
return ' '.join(text.split()) # type: ignore
if isinstance(xpath_results, (str, Number, bool)):
return str(xpath_results)
if xpath_results is None and allow_none:
return None
if xpath_results is None and not allow_none:
raise ValueError('extract_text(None, allow_none=False)')
raise ValueError('unsupported type')
def normalize_url(url: str, base_url: str) -> str:
"""Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
Args:
* url (str): Relative URL
* base_url (str): Base URL, it must be an absolute URL.
Example:
>>> normalize_url('https://example.com', 'http://example.com/')
'https://example.com/'
>>> normalize_url('//example.com', 'http://example.com/')
'http://example.com/'
>>> normalize_url('//example.com', 'https://example.com/')
'https://example.com/'
>>> normalize_url('/path?a=1', 'https://example.com')
'https://example.com/path?a=1'
>>> normalize_url('', 'https://example.com')
'https://example.com/'
>>> normalize_url('/test', '/path')
raise ValueError
Raises:
* lxml.etree.ParserError
Returns:
* str: normalized URL
"""
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(base_url)
url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(base_url, url)
    # fix relative urls that fall through the cracks
if '://' not in url:
url = urljoin(base_url, url)
parsed_url = urlparse(url)
    # add a / at the end of the url if there is no path
if not parsed_url.netloc:
raise ValueError('Cannot parse url')
if not parsed_url.path:
url += '/'
return url
def extract_url(xpath_results: list[ElementType] | ElementType | str | Number | bool | None, base_url: str) -> str:
"""Extract and normalize URL from lxml Element
Example:
>>> def f(s, search_url):
>>> return searx.utils.extract_url(html.fromstring(s), search_url)
>>> f('<span id="42">https://example.com</span>', 'http://example.com/')
'https://example.com/'
>>> f('https://example.com', 'http://example.com/')
'https://example.com/'
>>> f('//example.com', 'http://example.com/')
'http://example.com/'
>>> f('//example.com', 'https://example.com/')
'https://example.com/'
>>> f('/path?a=1', 'https://example.com')
'https://example.com/path?a=1'
>>> f('', 'https://example.com')
raise lxml.etree.ParserError
>>> searx.utils.extract_url([], 'https://example.com')
raise ValueError
Raises:
* ValueError
* lxml.etree.ParserError
Returns:
* str: normalized URL
"""
if xpath_results == []:
raise ValueError('Empty url resultset')
url = extract_text(xpath_results)
if url:
return normalize_url(url, base_url)
raise ValueError('URL not found')
def dict_subset(dictionary: MutableMapping[t.Any, t.Any], properties: set[str]) -> MutableMapping[str, t.Any]:
"""Extract a subset of a dict
Examples:
>>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
{'A': 'a', 'C': 'c'}
>>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
{'A': 'a'}
"""
return {k: dictionary[k] for k in properties if k in dictionary}
def humanize_bytes(size: int | float, precision: int = 2):
"""Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
s = ['B ', 'KB', 'MB', 'GB', 'TB']
x = len(s)
p = 0
    while size > 1024 and p < x - 1:  # p < x - 1 avoids an IndexError on huge values
p += 1
size = size / 1024.0
return "%.*f %s" % (precision, size, s[p])
def humanize_number(size: int | float, precision: int = 0):
"""Determine the *human readable* value of a decimal number."""
s = ['', 'K', 'M', 'B', 'T']
x = len(s)
p = 0
    while size > 1000 and p < x - 1:  # p < x - 1 avoids an IndexError on huge values
p += 1
size = size / 1000.0
return "%.*f%s" % (precision, size, s[p])
def convert_str_to_int(number_str: str) -> int:
"""Convert number_str to int or 0 if number_str is not a number."""
if number_str.isdigit():
return int(number_str)
return 0
def extr(txt: str, begin: str, end: str, default: str = "") -> str:
"""Extract the string between ``begin`` and ``end`` from ``txt``
:param txt: String to search in
:param begin: First string to be searched for
:param end: Second string to be searched for after ``begin``
:param default: Default value if one of ``begin`` or ``end`` is not
found. Defaults to an empty string.
:return: The string between the two search-strings ``begin`` and ``end``.
If at least one of ``begin`` or ``end`` is not found, the value of
``default`` is returned.
Examples:
>>> extr("abcde", "a", "e")
"bcd"
>>> extr("abcde", "a", "z", deafult="nothing")
"nothing"
"""
# From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
try:
first = txt.index(begin) + len(begin)
return txt[first : txt.index(end, first)]
except ValueError:
return default
def int_or_zero(num: list[str] | str) -> int:
"""Convert num to int or 0. num can be either a str or a list.
If num is a list, the first element is converted to int (or return 0 if the list is empty).
If num is a str, see convert_str_to_int
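    Examples:
        >>> int_or_zero(['42', '7'])
        42
        >>> int_or_zero('not a number')
        0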
"""
if isinstance(num, list):
if len(num) < 1:
return 0
num = num[0]
return convert_str_to_int(num)
def load_module(filename: str, module_dir: str) -> types.ModuleType:
modname = splitext(filename)[0]
modpath = join(module_dir, filename)
    # see https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
spec = importlib.util.spec_from_file_location(modname, modpath)
if not spec:
raise ValueError(f"Error loading '{modpath}' module")
module = importlib.util.module_from_spec(spec)
if not spec.loader:
raise ValueError(f"Error loading '{modpath}' module")
spec.loader.exec_module(module)
return module
def to_string(obj: t.Any) -> str:
"""Convert obj to its string representation."""
if isinstance(obj, str):
return obj
if hasattr(obj, '__str__'):
return str(obj)
return repr(obj)
def ecma_unescape(string: str) -> str:
"""Python implementation of the unescape javascript function
https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
Examples:
>>> ecma_unescape('%u5409')
        '吉'
>>> ecma_unescape('%20')
' '
>>> ecma_unescape('%F3')
'ó'
"""
# "%u5409" becomes "吉"
string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
# "%20" becomes " ", "%F3" becomes "ó"
string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
return string
def remove_pua_from_str(string: str):
"""Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
.. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
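    Example:
        >>> remove_pua_from_str("a" + chr(0xE000) + "b")
        'ab'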
"""
pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
s: list[str] = []
for c in string:
i = ord(c)
if any(a <= i <= b for (a, b) in pua_ranges):
continue
s.append(c)
return "".join(s)
def get_string_replaces_function(replaces: dict[str, str]) -> Callable[[str], str]:
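    """Compile ``replaces`` into one regexp and return a function that applies
    all replacements in a single pass over its input string.
    Example:
        >>> f = get_string_replaces_function({'foo': 'bar', '&': 'and'})
        >>> f('foo & baz')
        'bar and baz'
    """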
rep = {re.escape(k): v for k, v in replaces.items()}
pattern = re.compile("|".join(rep.keys()))
def func(text: str):
return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
return func
def get_engine_from_settings(name: str) -> dict[str, dict[str, str]]:
"""Return engine configuration from settings.yml of a given engine name"""
if 'engines' not in settings:
return {}
for engine in settings['engines']:
if 'name' not in engine:
continue
if name == engine['name']:
return engine
return {}
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
"""Return cached compiled :py:obj:`lxml.etree.XPath` object.
``TypeError``:
Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
:py:obj:`lxml.etree.XPath`.
``SearxXPathSyntaxException``:
Raised when there is a syntax error in the *XPath* selector (``str``).
"""
if isinstance(xpath_spec, str):
result = _XPATH_CACHE.get(xpath_spec, None)
if result is None:
try:
result = XPath(xpath_spec)
except XPathSyntaxError as e:
raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
_XPATH_CACHE[xpath_spec] = result
return result
if isinstance(xpath_spec, XPath):
return xpath_spec
raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath') # pyright: ignore[reportUnreachable]
def eval_xpath(element: ElementType, xpath_spec: XPathSpecType) -> t.Any:
"""Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into
a :py:obj:`lxml.etree.XPath` object once for all. The return value of
``xpath(..)`` is complex, read `XPath return values`_ for more details.
.. _XPath return values:
https://lxml.de/xpathxslt.html#xpath-return-values
``TypeError``:
Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
:py:obj:`lxml.etree.XPath`.
``SearxXPathSyntaxException``:
Raised when there is a syntax error in the *XPath* selector (``str``).
    ``SearxEngineXPathException``:
        Raised when the XPath can't be evaluated (masked
        :py:obj:`lxml.etree.XPathError`).
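    Example:
        >>> from lxml import html
        >>> dom = html.fromstring('<ul><li>a</li><li>b</li></ul>')
        >>> eval_xpath(dom, '//li/text()')
        ['a', 'b']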
"""
xpath: XPath = get_xpath(xpath_spec)
try:
# https://lxml.de/xpathxslt.html#xpath-return-values
return xpath(element)
except XPathError as e:
arg = ' '.join([str(i) for i in e.args])
raise SearxEngineXPathException(xpath_spec, arg) from e
def eval_xpath_list(element: ElementType, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]:
"""Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the
return value is a :py:obj:`list`. The minimum length of the list is also
checked (if ``min_len`` is set)."""
result: list[t.Any] = eval_xpath(element, xpath_spec)
if not isinstance(result, list):
raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
if min_len is not None and min_len > len(result):
raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
return result
def eval_xpath_getindex(
element: ElementType,
xpath_spec: XPathSpecType,
index: int,
default: t.Any = _NOTSET,
) -> t.Any:
"""Same as :py:obj:`searx.utils.eval_xpath_list`, but returns item on
position ``index`` from the list (index starts with ``0``).
The exceptions known from :py:obj:`searx.utils.eval_xpath` are thrown. If a
default is specified, this is returned if an element at position ``index``
could not be determined.
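    Example:
        >>> from lxml import html
        >>> dom = html.fromstring('<ul><li>a</li><li>b</li></ul>')
        >>> eval_xpath_getindex(dom, '//li/text()', 1)
        'b'
        >>> eval_xpath_getindex(dom, '//li/text()', 5, default=None) is None
        True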
"""
result = eval_xpath_list(element, xpath_spec)
if -len(result) <= index < len(result):
return result[index]
if default == _NOTSET:
        # raise a SearxEngineXPathException instead of an IndexError to record
# xpath_spec
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
return default
def get_embeded_stream_url(url: str):
"""
    Converts a standard video URL into its embed format. Supported services include YouTube,
Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
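    Example:
        >>> get_embeded_stream_url('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
        'https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ'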
"""
parsed_url = urlparse(url)
iframe_src = None
# YouTube
if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
video_id = parse_qs(parsed_url.query).get('v', [])
if video_id:
iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
# Facebook
elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
encoded_href = urlencode({'href': url})
iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
# Instagram
elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
if parsed_url.path.endswith('/'):
iframe_src = url + 'embed'
else:
iframe_src = url + '/embed'
# TikTok
elif (
parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
and parsed_url.path.startswith('/@')
and '/video/' in parsed_url.path
):
path_parts = parsed_url.path.split('/video/')
video_id = path_parts[1]
iframe_src = 'https://www.tiktok.com/embed/' + video_id
# Dailymotion
elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
path_parts = parsed_url.path.split('/')
if len(path_parts) == 3:
video_id = path_parts[2]
iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
# Bilibili
elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
path_parts = parsed_url.path.split('/')
video_id = path_parts[2]
param_key = None
if video_id.startswith('av'):
video_id = video_id[2:]
param_key = 'aid'
elif video_id.startswith('BV'):
param_key = 'bvid'
        # only build the URL when the id scheme was recognized (param_key set)
        if param_key:
            iframe_src = (
                f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
            )
return iframe_src
def _j2p_process_escape(match: re.Match[str]) -> str:
# deal with ECMA escape characters
_escape = match.group(1) or match.group(2)
return (
Rf'\{_escape}'
if _escape in _JSON_PASSTHROUGH_ESCAPES
else R'\u00' if _escape == 'x' else '' if _escape == '\n' else _escape
)
def _j2p_decimal(match: re.Match[str]) -> str:
return (
match.group(1)
+ match.group(2)
+ (match.group(3).replace("_", "") or "0")
+ "."
+ (match.group(4).replace("_", "") or "0")
)
def _j2p_decimal2(match: re.Match[str]) -> str:
return match.group(1) + match.group(2) + match.group(3).replace("_", "")
def js_obj_str_to_python(js_obj_str: str) -> t.Any:
"""Convert a javascript variable into JSON and then load the value
It does not deal with all cases, but it is good enough for now.
chompjs has a better implementation.
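    Example:
        >>> js_obj_str_to_python("{a: 12, b: 'hello', c: [1, 2, 3,], d: undefined}")
        {'a': 12, 'b': 'hello', 'c': [1, 2, 3], 'd': None}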
"""
s = js_obj_str_to_json_str(js_obj_str)
# load the JSON and return the result
if s == "":
raise ValueError("js_obj_str can't be an empty string")
try:
return json.loads(s)
except json.JSONDecodeError as e:
logger.debug("Internal error: js_obj_str_to_python creates invalid JSON:\n%s", s)
raise ValueError("js_obj_str_to_python creates invalid JSON") from e
def js_obj_str_to_json_str(js_obj_str: str) -> str:
if not isinstance(js_obj_str, str):
raise ValueError("js_obj_str must be of type str")
if js_obj_str == "":
raise ValueError("js_obj_str can't be an empty string")
    # when in_string is not None, it contains the character that has opened the string:
    # either a single quote, a double quote or a backtick
in_string = None
# cut the string:
# r"""{ a:"f\"irst", c:'sec"ond'}"""
# becomes
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
parts = _JS_STRING_DELIMITERS.split(js_obj_str)
    # does the previous part end with a backslash?
    backslash_just_before = False
    for i, p in enumerate(parts):
        if p == in_string and not backslash_just_before:
            # * the current part matches the character which has opened the string
            # * there is no backslash just before
            # --> the current part closes the current string
            in_string = None
            # replace single quote and backtick by a double quote
            # since JSON only supports double quotes for strings
parts[i] = '"'
elif in_string:
# --> we are in a JS string
# replace the colon by a temporary character
# so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
p = p.replace(':', chr(1))
# replace JS escape sequences by JSON escape sequences
p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
            # the JS string is delimited by single quotes, which JSON does not
            # support: single quote delimited strings are converted to double
            # quote delimited strings. Here, inside such a JS string, we
            # escape the double quotes
if in_string == "'":
p = p.replace('"', r'\"')
parts[i] = p
            # deal with the sequence backslash then quote
            # since js_obj_str splits on quotes, we detect this case:
            # * the previous part ends with a backslash
            # * the current part is a single quote
            # when detected, the backslash is removed from the previous part
            if backslash_just_before and p[:1] == "'":
                parts[i - 1] = parts[i - 1][:-1]
        elif in_string is None and p in ('"', "'", "`"):
            # we are not in a string but p is a string delimiter
            # --> that's the start of a new string
            in_string = p
            # replace single quote and backtick by a double quote
            # since JSON only supports double quotes for strings
            parts[i] = '"'
elif in_string is None:
# we are not in a string
            # replace these values by null:
# * void 0
# * void(0)
# * undefined
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
# make sure there is a leading zero in front of float
p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
            # remove a trailing comma in a list or an object
            # for example [1,2,3,] becomes [1,2,3]
p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
parts[i] = p
# update for the next iteration
        backslash_just_before = len(p) > 0 and p[-1] == '\\'
# join the string
s = ''.join(parts)
    # add quotes around the keys
# { a: 12 }
# becomes
# { "a": 12 }
s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # replace the placeholder character by a colon and strip whitespace
s = s.replace(chr(1), ':').strip()
return s
def parse_duration_string(duration_str: str) -> timedelta | None:
"""Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
Returns None if the provided string doesn't match any of the formats.
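    Examples:
        >>> parse_duration_string('02:45')
        datetime.timedelta(seconds=165)
        >>> parse_duration_string('1:02:45')
        datetime.timedelta(seconds=3765)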
"""
duration_str = duration_str.strip()
if not duration_str:
return None
try:
        # prepending ["00"] inits hours to 0 if they are not provided; keeping
        # the last three parts makes both MM:SS and HH:MM:SS parse correctly
        time_parts = (["00"] + duration_str.split(":"))[-3:]
hours, minutes, seconds = map(int, time_parts)
return timedelta(hours=hours, minutes=minutes, seconds=seconds)
except (ValueError, TypeError):
pass
return None