feat: search tokenization, handling of quoted literal search, and postgres fuzziness (#2351)

* Creating postgres migration script and starting to set up to detect database

* non-working placeholders for postgres pg_trgm

* First draft of some indexes

* non-working commit of postgres indexing

* Further non-working edits to db-centric fuzzy search

* update alembic for extensions

* More non-working setup

* Move db type check to init_db

* fix typo in db name check

* Add sqlite token search and postgres full text search

* reorder search to hit exact matches faster

* Add settings and docs for POSTGRES_LANGUAGE (full text search)

* Use user-specified POSTGRES_LANGUAGE in search

* fix fuzzy search typo

* Remove full text search and instead order by trigram match

* cleaner adding of indices, remove fulltext

* Cleanup old import of getting app settings

* Fix typo in index

* Fix some alembic fuzzy typos

* Remove diagnostic printing from alembic migration

* Fix mixed up commutator for trigram operator and relax criteria

* forgot to remove query debug

* sort only on name

* token and fuzzy search tests

* Refactor recipe search test to avoid rare random string cross-matches.

* Add ability to quote parts of search for exact match

* Remove internal punctuation, unless it's quoted for literal search

* Add tests for special character removal and literal search

* Remove the outer double quotes from searches, but leave internal single quotes alone.

* Update tests to avoid intra-test name collisions

* Fixing leftovers highlighted by lint

* cleanup linting and mypy errors

* Fix test cross-matching on dirty db (leftovers from bulk import)

* forgot to cleanup something when debugging mypy errors

* re-order pg_trgm loading in postgres

* address comments
Jacob Corn 2023-05-28 19:46:53 +02:00 committed by GitHub
parent 27ebb4c462
commit 7e0d29afc7
7 changed files with 304 additions and 43 deletions

View File

@@ -0,0 +1,89 @@
"""postgres fuzzy search

Revision ID: b3dbb554ba53
Revises: 38514b39a824
Create Date: 2023-04-13 06:47:04.617131

"""
import sqlalchemy as sa

import mealie.db.migration_types
from alembic import op
import alembic.context as context

from mealie.core.config import get_app_settings

# revision identifiers, used by Alembic.
revision = "b3dbb554ba53"
down_revision = "38514b39a824"
branch_labels = None
depends_on = None


def get_db_type():
    return op.get_context().dialect.name


def setup_postgres_trigrams():
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
    op.create_index(
        "ix_recipes_name_normalized_gin",
        table_name="recipes",
        columns=["name_normalized"],
        unique=False,
        postgresql_using="gin",
        postgresql_ops={
            "name_normalized": "gin_trgm_ops",
        },
    )
    op.create_index(
        "ix_recipes_description_normalized_gin",
        table_name="recipes",
        columns=["description_normalized"],
        unique=False,
        postgresql_using="gin",
        postgresql_ops={
            "description_normalized": "gin_trgm_ops",
        },
    )
    op.create_index(
        "ix_recipes_ingredients_note_normalized_gin",
        table_name="recipes_ingredients",
        columns=["note_normalized"],
        unique=False,
        postgresql_using="gin",
        postgresql_ops={
            "note_normalized": "gin_trgm_ops",
        },
    )
    op.create_index(
        "ix_recipes_ingredients_original_text_normalized_gin",
        table_name="recipes_ingredients",
        columns=["original_text_normalized"],
        unique=False,
        postgresql_using="gin",
        postgresql_ops={
            "original_text_normalized": "gin_trgm_ops",
        },
    )


def remove_postgres_trigrams():
    # drop the trigram indexes before dropping the extension they depend on
    op.drop_index("ix_recipes_name_normalized_gin", table_name="recipes")
    op.drop_index("ix_recipes_description_normalized_gin", table_name="recipes")
    op.drop_index("ix_recipes_ingredients_note_normalized_gin", table_name="recipes_ingredients")
    op.drop_index("ix_recipes_ingredients_original_text_normalized_gin", table_name="recipes_ingredients")
    op.execute("DROP EXTENSION IF EXISTS pg_trgm;")


def upgrade():
    if get_db_type() == "postgresql":
        setup_postgres_trigrams()
    else:
        pass


def downgrade():
    if get_db_type() == "postgresql":
        remove_postgres_trigrams()
    else:
        pass
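Aside: the GIN indexes this migration creates exist to accelerate pg_trgm's similarity operators, which the search query later in this commit calls through `.op("%>")` and `.op("<->>")`. The following is a minimal sketch, not part of the commit, of what those operators compute; `word_similarity()` underpins the `<%`/`%>` pair, and the DSN here is a placeholder for any scratch Postgres database.

import sqlalchemy as sa

# placeholder DSN for a scratch database, not Mealie configuration
engine = sa.create_engine("postgresql://user:pass@localhost:5432/scratch")

with engine.connect() as conn:
    conn.execute(sa.text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
    # word_similarity() is the function behind the %> operator used in search;
    # compare the score for a typo against the 0.5 threshold the commit sets
    score = conn.execute(sa.text("SELECT word_similarity('brocolli', 'broccoli soup');")).scalar_one()
    print(f"word_similarity = {score:.2f}")
    conn.commit()  # persist the extension if the database lacked it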

View File

@@ -68,6 +68,13 @@ Yes, you can install Mealie on your local machine. HOWEVER, it is recommended th
- [Advanced Installation](../installation/advanced/)

## What is fuzzy search and how do I use it?
Mealie can use fuzzy search, which is robust to minor typos: searching for "brocolli" will still find your recipe for "broccoli soup". Fuzzy search is only available on a Postgres database backend. To enable it, you will need to migrate to Postgres:

1. Backup your database and download the .zip file (same as when [migrating](./migrating-to-mealie-v1.md))
2. Set up a [Postgres](./installation/postgres.md) instance of Mealie
3. Upload the backup .zip and click to apply it (as a migration)

## How can I attach an image or video to a recipe?
Yes. Mealie's recipe steps and other fields support markdown syntax and therefore support images and videos. To attach an image to a recipe, upload it as an asset and use the provided copy button to generate the HTML image tag required to render it. For videos, Mealie provides no hosting; you'll need to host your videos with another provider and embed them in your recipe. Generally, the video provider will supply a link to the video and the HTML tag required to render it. For example, YouTube provides the following link that works inside a step. You can adjust the width and height attributes as necessary to ensure a good fit.

View File

@@ -92,6 +92,9 @@ def main():
    logger.info("Migration needed. Performing migration...")
    command.upgrade(alembic_cfg, "head")

    if session.get_bind().name == "postgresql":  # needed for fuzzy search and fast GIN text indices
        session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))

    db = get_repositories(session)
    if db.users.get_all():
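Aside: `CREATE EXTENSION` typically needs owner or superuser rights on the database, so one way to confirm this startup hook actually landed is to query `pg_extension` directly. A hypothetical check, separate from the commit (the DSN is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost:5432/mealie")  # placeholder DSN

with engine.connect() as conn:
    # counts 1 if the extension is installed in this database, 0 otherwise
    installed = conn.execute(text("SELECT count(*) FROM pg_extension WHERE extname = 'pg_trgm';")).scalar_one()
    print("pg_trgm installed:", bool(installed))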

View File

@@ -1,7 +1,9 @@
from typing import TYPE_CHECKING

import sqlalchemy as sa
from sqlalchemy import Boolean, Float, ForeignKey, Integer, String, event, orm
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm.session import Session
from text_unidecode import unidecode

from mealie.db.models._model_base import BaseMixins, SqlAlchemyBase

@@ -87,7 +89,7 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
    original_text_normalized: Mapped[str | None] = mapped_column(String, index=True)

    @auto_init()
-   def __init__(self, note: str | None = None, orginal_text: str | None = None, **_) -> None:
+   def __init__(self, session: Session, note: str | None = None, orginal_text: str | None = None, **_) -> None:
        # SQLAlchemy events do not seem to register things that are set during auto_init
        if note is not None:
            self.note_normalized = unidecode(note).lower().strip()

@@ -95,13 +97,51 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
        if orginal_text is not None:
            self.orginal_text = unidecode(orginal_text).lower().strip()

        tableargs = [  # base set of indices
            sa.Index(
                "ix_recipes_ingredients_note_normalized",
                "note_normalized",
                unique=False,
            ),
            sa.Index(
                "ix_recipes_ingredients_original_text_normalized",
                "original_text_normalized",
                unique=False,
            ),
        ]
        if session.get_bind().name == "postgresql":
            tableargs.extend(
                [
                    sa.Index(
                        "ix_recipes_ingredients_note_normalized_gin",
                        "note_normalized",
                        unique=False,
                        postgresql_using="gin",
                        postgresql_ops={
                            "note_normalized": "gin_trgm_ops",
                        },
                    ),
                    sa.Index(
                        "ix_recipes_ingredients_original_text_normalized_gin",
                        "original_text_normalized",
                        unique=False,
                        postgresql_using="gin",
                        postgresql_ops={
                            "original_text_normalized": "gin_trgm_ops",
                        },
                    ),
                ]
            )

        # add indices
        self.__table_args__ = tuple(tableargs)


@event.listens_for(RecipeIngredientModel.note, "set")
def receive_note(target: RecipeIngredientModel, value: str, oldvalue, initiator):
    if value is not None:
-       target.name_normalized = unidecode(value).lower().strip()
+       target.note_normalized = unidecode(value).lower().strip()
    else:
-       target.name_normalized = None
+       target.note_normalized = None


@event.listens_for(RecipeIngredientModel.original_text, "set")
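Aside: the conditional `tableargs` pattern above (and its twin in the recipes model below) reduces to this: declare portable B-tree indexes unconditionally, and append Postgres-only GIN trigram indexes when the bind is Postgres. A distilled, self-contained sketch with made-up table and column names, not Mealie code:

import sqlalchemy as sa
from sqlalchemy import orm

# hypothetical engine; the dialect check mirrors session.get_bind().name above
engine = sa.create_engine("sqlite://")


class Base(orm.DeclarativeBase):
    pass


class Note(Base):
    __tablename__ = "notes"
    id: orm.Mapped[int] = orm.mapped_column(primary_key=True)
    text_normalized: orm.Mapped[str | None] = orm.mapped_column(sa.String)


# base, portable index: works on sqlite and postgres alike
sa.Index("ix_notes_text_normalized", Note.text_normalized)

if engine.dialect.name == "postgresql":
    # GIN trigram index; requires the pg_trgm extension created by the migration
    sa.Index(
        "ix_notes_text_normalized_gin",
        Note.text_normalized,
        postgresql_using="gin",
        postgresql_ops={"text_normalized": "gin_trgm_ops"},
    )

Base.metadata.create_all(engine)  # emits only the indexes constructed for this dialect

For databases that already exist, the Alembic migration earlier in this commit is what adds the same GIN indexes.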

View File

@@ -35,7 +35,9 @@ if TYPE_CHECKING:
class RecipeModel(SqlAlchemyBase, BaseMixins):
    __tablename__ = "recipes"
-   __table_args__ = (sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),)
+   __table_args__: tuple[sa.UniqueConstraint, ...] = (
+       sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
+   )

    id: Mapped[GUID] = mapped_column(GUID, primary_key=True, default=GUID.generate)
    slug: Mapped[str | None] = mapped_column(sa.String, index=True)

@@ -192,6 +194,46 @@ class RecipeModel(SqlAlchemyBase, BaseMixins):
        if description is not None:
            self.description_normalized = unidecode(description).lower().strip()

        tableargs = [  # base set of indices
            sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
            sa.Index(
                "ix_recipes_name_normalized",
                "name_normalized",
                unique=False,
            ),
            sa.Index(
                "ix_recipes_description_normalized",
                "description_normalized",
                unique=False,
            ),
        ]
        if session.get_bind().name == "postgresql":
            tableargs.extend(
                [
                    sa.Index(
                        "ix_recipes_name_normalized_gin",
                        "name_normalized",
                        unique=False,
                        postgresql_using="gin",
                        postgresql_ops={
                            "name_normalized": "gin_trgm_ops",
                        },
                    ),
                    sa.Index(
                        "ix_recipes_description_normalized_gin",
                        "description_normalized",
                        unique=False,
                        postgresql_using="gin",
                        postgresql_ops={
                            "description_normalized": "gin_trgm_ops",
                        },
                    ),
                ]
            )

        # add indices
        self.__table_args__ = tuple(tableargs)


@event.listens_for(RecipeModel.name, "set")
def receive_name(target: RecipeModel, value: str, oldvalue, initiator):

View File

@@ -1,10 +1,11 @@
import re as re
from collections.abc import Sequence
from random import randint
from uuid import UUID

from pydantic import UUID4
from slugify import slugify
-from sqlalchemy import Select, and_, desc, func, or_, select
+from sqlalchemy import Select, and_, desc, func, or_, select, text
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from text_unidecode import unidecode

@@ -151,29 +152,94 @@ class RepositoryRecipes(RepositoryGeneric[Recipe, RecipeModel]):
        return ids + additional_ids

    def _add_search_to_query(self, query: Select, search: str) -> Select:
        """
        0. fuzzy search (postgres only) and tokenized search are performed separately
        1. take search string and do a little pre-normalization
        2. look for internal quoted strings and keep them together as "literal" parts of the search
        3. remove special characters from each non-literal search string
        4. token search looks for any individual exact hit in name, description, and ingredients
        5. fuzzy search looks for trigram hits in name, description, and ingredients
        6. sort order is determined by closeness to the recipe name

        Should search also look at tags?
        """

        normalized_search = unidecode(search).lower().strip()
        punctuation = "!\#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # string.punctuation with ' & " removed

        # keep quoted phrases together as literal portions of the search string
        literal = False
        quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")  # thank you stack exchange!
        removequotes_regex = re.compile(r"""['"](.*)['"]""")
        if quoted_regex.search(normalized_search):
            literal = True
            temp = normalized_search
            quoted_search_list = [match.group() for match in quoted_regex.finditer(temp)]  # all quoted strings
            quoted_search_list = [removequotes_regex.sub("\\1", x) for x in quoted_search_list]  # remove outer quotes
            temp = quoted_regex.sub("", temp)  # remove all quoted strings, leaving just the non-quoted
            temp = temp.translate(
                str.maketrans(punctuation, " " * len(punctuation))
            )  # punctuation -> spaces for splitting, but only on unquoted strings
            unquoted_search_list = temp.split()  # all unquoted strings
            normalized_search_list = quoted_search_list + unquoted_search_list
        else:
            normalized_search = normalized_search.translate(str.maketrans(punctuation, " " * len(punctuation)))
            normalized_search_list = normalized_search.split()
        normalized_search_list = [x.strip() for x in normalized_search_list]  # remove padding whitespace inside quotes

        # I would prefer to just do this in the recipe_ingredient.any part of the main query, but it turns out
        # that at least sqlite won't use indexes for that correctly anymore and takes a big hit, so prefiltering it is
-        ingredient_ids = (
-            self.session.execute(
-                select(RecipeIngredientModel.id).filter(
-                    or_(
-                        RecipeIngredientModel.note_normalized.like(f"%{normalized_search}%"),
-                        RecipeIngredientModel.original_text_normalized.like(f"%{normalized_search}%"),
-                    )
-                )
-            )
-            .scalars()
-            .all()
-        )
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            RecipeIngredientModel.note_normalized.op("%>")(normalized_search),
+                            RecipeIngredientModel.original_text_normalized.op("%>")(normalized_search),
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )
+        else:  # exact token search
+            ingredient_ids = (
+                self.session.execute(
+                    select(RecipeIngredientModel.id).filter(
+                        or_(
+                            *[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                            *[
+                                RecipeIngredientModel.original_text_normalized.like(f"%{ns}%")
+                                for ns in normalized_search_list
+                            ],
+                        )
+                    )
+                )
+                .scalars()
+                .all()
+            )

-        q = query.filter(
-            or_(
-                RecipeModel.name_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.description_normalized.like(f"%{normalized_search}%"),
-                RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
-            )
-        ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
+        if (self.session.get_bind().name == "postgresql") & (literal is False):  # fuzzy search
+            # default = 0.7 is too strict for effective fuzzing
+            self.session.execute(text("set pg_trgm.word_similarity_threshold = 0.5;"))
+            q = query.filter(
+                or_(
+                    RecipeModel.name_normalized.op("%>")(normalized_search),
+                    RecipeModel.description_normalized.op("%>")(normalized_search),
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(  # trigram ordering could be too slow on a million-record db, but is fine with thousands
+                func.least(
+                    RecipeModel.name_normalized.op("<->>")(normalized_search),
+                )
+            )
+        else:  # exact token search
+            q = query.filter(
+                or_(
+                    *[RecipeModel.name_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    *[RecipeModel.description_normalized.like(f"%{ns}%") for ns in normalized_search_list],
+                    RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
+                )
+            ).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
        return q

    def page_all(
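Aside: the quote-handling branch above is easiest to follow with a concrete input. Below is a standalone sketch of the same normalization steps; the two regexes and the punctuation table are copied from the diff, while the sample search string is invented.

import re

from text_unidecode import unidecode

punctuation = "!\#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # string.punctuation with ' & " removed
quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")
removequotes_regex = re.compile(r"""['"](.*)['"]""")

search = unidecode('"broccoli soup" quick-and-easy').lower().strip()  # invented example

if quoted_regex.search(search):
    # quoted phrases survive as single literal tokens, minus their outer quotes
    quoted = [removequotes_regex.sub("\\1", m.group()) for m in quoted_regex.finditer(search)]
    rest = quoted_regex.sub("", search)  # what remains after quoted phrases are cut out
    rest = rest.translate(str.maketrans(punctuation, " " * len(punctuation)))
    tokens = quoted + rest.split()
else:
    tokens = search.translate(str.maketrans(punctuation, " " * len(punctuation))).split()

tokens = [t.strip() for t in tokens]
print(tokens)  # ['broccoli soup', 'quick', 'and', 'easy']

Note that a quoted phrase flips `literal` to True in the real method, which also disables the Postgres fuzzy branch: quoted searches always go through exact LIKE matching.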

View File

@@ -432,36 +432,28 @@ def test_recipe_repo_pagination_by_foods(database: AllRepositories, unique_user:
def test_recipe_repo_search(database: AllRepositories, unique_user: TestUser):
-   ingredient_1 = random_string(10)
-   ingredient_2 = random_string(10)
-   name_part_1 = random_string(10)
-   name_1 = f"{name_part_1} soup"
-   name_part_2 = random_string(10)
-   name_2 = f"Rustic {name_part_2} stew"
-   name_3 = f"{ingredient_1} Soup"
-   description_part_1 = random_string(10)
    recipes = [
        Recipe(
            user_id=unique_user.user_id,
            group_id=unique_user.group_id,
-           name=name_1,
-           description=f"My favorite {description_part_1}",
+           name="Steinbock Sloop",
+           description=f"My favorite horns are delicious",
            recipe_ingredient=[
-               RecipeIngredient(note=ingredient_1),
+               RecipeIngredient(note="alpine animal"),
            ],
        ),
        Recipe(
            user_id=unique_user.user_id,
            group_id=unique_user.group_id,
-           name=name_2,
+           name="Fiddlehead Fern Stir Fry",
            recipe_ingredient=[
-               RecipeIngredient(note=ingredient_2),
+               RecipeIngredient(note="moss"),
            ],
        ),
        Recipe(
            user_id=unique_user.user_id,
            group_id=unique_user.group_id,
-           name=name_3,
+           name="Animal Sloop",
        ),
        # Test diacritics
        Recipe(

@@ -481,28 +473,50 @@ def test_recipe_repo_search(database: AllRepositories, unique_user: TestUser):
    assert len(empty_result) == 0

    # Search by title
-   title_result = database.recipes.page_all(pagination_query, search=name_part_2).items
+   title_result = database.recipes.page_all(pagination_query, search="Steinbock").items
    assert len(title_result) == 1
-   assert title_result[0].name == name_2
+   assert title_result[0].name == "Steinbock Sloop"

    # Search by description
-   description_result = database.recipes.page_all(pagination_query, search=description_part_1).items
+   description_result = database.recipes.page_all(pagination_query, search="horns").items
    assert len(description_result) == 1
-   assert description_result[0].name == name_1
+   assert description_result[0].name == "Steinbock Sloop"

    # Search by ingredient
-   ingredient_result = database.recipes.page_all(pagination_query, search=ingredient_2).items
+   ingredient_result = database.recipes.page_all(pagination_query, search="moss").items
    assert len(ingredient_result) == 1
-   assert ingredient_result[0].name == name_2
+   assert ingredient_result[0].name == "Fiddlehead Fern Stir Fry"

    # Make sure title matches are ordered in front
-   ordered_result = database.recipes.page_all(pagination_query, search=ingredient_1).items
+   ordered_result = database.recipes.page_all(pagination_query, search="animal sloop").items
    assert len(ordered_result) == 2
-   assert ordered_result[0].name == name_3
-   assert ordered_result[1].name == name_1
+   assert ordered_result[0].name == "Animal Sloop"
+   assert ordered_result[1].name == "Steinbock Sloop"

+   # Test literal search
+   literal_result = database.recipes.page_all(pagination_query, search='"Animal Sloop"').items
+   assert len(literal_result) == 1
+   assert literal_result[0].name == "Animal Sloop"

+   # Test special character removal from non-literal searches
+   character_result = database.recipes.page_all(pagination_query, search="animal-sloop").items
+   assert len(character_result) == 2
+   assert character_result[0].name == "Animal Sloop"
+   assert character_result[1].name == "Steinbock Sloop"

+   # Test string normalization
+   normalized_result = database.recipes.page_all(pagination_query, search="ratat").items
+   print([r.name for r in normalized_result])
+   assert len(normalized_result) == 1
+   assert normalized_result[0].name == "Rátàtôuile"

+   # Test token separation
+   token_result = database.recipes.page_all(pagination_query, search="delicious horns").items
+   assert len(token_result) == 1
+   assert token_result[0].name == "Steinbock Sloop"

+   # Test fuzzy search
+   if database.session.get_bind().name == "postgresql":
+       fuzzy_result = database.recipes.page_all(pagination_query, search="Steinbuck").items
+       assert len(fuzzy_result) == 1
+       assert fuzzy_result[0].name == "Steinbock Sloop"