mirror of
https://github.com/searxng/searxng.git
synced 2025-10-27 08:42:31 -04:00
[fix] TrackerPatternsDB.clean_url: don't delete query argument from new_url (#5339)
For URLs like 'http://example.org?q=' (query_str is 'q=') or 'http://example.org?/foo/bar' (query_str is '/foo/bar'), the query part is a *simple string* and not a key/value dict: parse_qsl() returns an empty list for it. Such a string may only be removed from the URL if one of the patterns matches. In addition, get_pretty_url() now keeps such a *simple string* in the path element. Closes: https://github.com/searxng/searxng/issues/5299 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
d84ae96cf9
commit
33e798b01b
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""Simple implementation to store TrackerPatterns data in a SQL database."""
|
"""Simple implementation to store TrackerPatterns data in a SQL database."""
|
||||||
|
# pylint: disable=too-many-branches
|
||||||
|
|
||||||
import typing as t
|
import typing as t
|
||||||
|
|
||||||
@ -119,6 +120,12 @@ class TrackerPatternsDB:
|
|||||||
|
|
||||||
for rule in self.rules():
|
for rule in self.rules():
|
||||||
|
|
||||||
|
query_str: str = parsed_new_url.query
|
||||||
|
if not query_str:
|
||||||
|
# There are no more query arguments in the parsed_new_url on
|
||||||
|
# which rules can be applied, stop iterating over the rules.
|
||||||
|
break
|
||||||
|
|
||||||
if not re.match(rule[self.Fields.url_regexp], new_url):
|
if not re.match(rule[self.Fields.url_regexp], new_url):
|
||||||
# no match / ignore pattern
|
# no match / ignore pattern
|
||||||
continue
|
continue
|
||||||
@ -136,19 +143,33 @@ class TrackerPatternsDB:
|
|||||||
# overlapping urlPattern like ".*"
|
# overlapping urlPattern like ".*"
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# remove tracker arguments from the url-query part
|
|
||||||
query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query))
|
query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query))
|
||||||
|
if query_args:
|
||||||
|
# remove tracker arguments from the url-query part
|
||||||
for name, val in query_args.copy():
|
for name, val in query_args.copy():
|
||||||
# remove URL arguments
|
# remove URL arguments
|
||||||
for pattern in rule[self.Fields.del_args]:
|
for pattern in rule[self.Fields.del_args]:
|
||||||
if re.match(pattern, name):
|
if re.match(pattern, name):
|
||||||
log.debug("TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val)
|
log.debug(
|
||||||
|
"TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val
|
||||||
|
)
|
||||||
query_args.remove((name, val))
|
query_args.remove((name, val))
|
||||||
|
|
||||||
parsed_new_url = parsed_new_url._replace(query=urlencode(query_args))
|
parsed_new_url = parsed_new_url._replace(query=urlencode(query_args))
|
||||||
new_url = urlunparse(parsed_new_url)
|
new_url = urlunparse(parsed_new_url)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# The query argument for URLs like:
|
||||||
|
# - 'http://example.org?q=' --> query_str is 'q=' and query_args is []
|
||||||
|
# - 'http://example.org?/foo/bar' --> query_str is '/foo/bar' and query_args is []
|
||||||
|
# is a simple string and not a key/value dict.
|
||||||
|
for pattern in rule[self.Fields.del_args]:
|
||||||
|
if re.match(pattern, query_str):
|
||||||
|
log.debug("TRACKER_PATTERNS: %s remove tracker arg: '%s'", parsed_new_url.netloc, query_str)
|
||||||
|
parsed_new_url = parsed_new_url._replace(query="")
|
||||||
|
new_url = urlunparse(parsed_new_url)
|
||||||
|
break
|
||||||
|
|
||||||
if new_url != url:
|
if new_url != url:
|
||||||
return new_url
|
return new_url
|
||||||
|
|
||||||
|
|||||||
@ -356,6 +356,12 @@ def get_pretty_url(parsed_url: urllib.parse.ParseResult):
|
|||||||
path = parsed_url.path
|
path = parsed_url.path
|
||||||
path = path[:-1] if len(path) > 0 and path[-1] == '/' else path
|
path = path[:-1] if len(path) > 0 and path[-1] == '/' else path
|
||||||
path = unquote(path.replace("/", " › "))
|
path = unquote(path.replace("/", " › "))
|
||||||
|
|
||||||
|
# Keep the query argument for URLs like:
|
||||||
|
# - 'http://example.org?/foo/bar' --> parsed_url.query is '/foo/bar'
|
||||||
|
query_args: list[tuple[str, str]] = list(urllib.parse.parse_qsl(parsed_url.query))
|
||||||
|
if not query_args and parsed_url.query:
|
||||||
|
path += (" › .." if len(parsed_url.query) > 24 else " › ") + parsed_url.query[-24:]
|
||||||
return [parsed_url.scheme + "://" + parsed_url.netloc, path]
|
return [parsed_url.scheme + "://" + parsed_url.netloc, path]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user