mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-05-31 20:25:14 -04:00
Better bruteforce parsing for units (#3066)
* try to match units when brute parsing and no amount is matched
* brute parser: better handle multiple word food items

  Also checks the case when a food might have been split in a unit + ingredient
* fix formatting
* add test cases for ingredient parsing that don't start with an amount
* parametrized tests and added ingredient data fixture
* fixed group_id ref in tests
* fixed test inputs
* add extra tests for units as third token

---------

Co-authored-by: Michael Genson <71845777+michael-genson@users.noreply.github.com>
This commit is contained in:
parent
597e6c8e0f
commit
e686fa671c
@ -132,7 +132,7 @@ def parse_ingredient(tokens) -> tuple[str, str]:
|
|||||||
return ingredient, note
|
return ingredient, note
|
||||||
|
|
||||||
|
|
||||||
def parse(ing_str) -> BruteParsedIngredient:
|
def parse(ing_str, parser) -> BruteParsedIngredient:
|
||||||
amount = 0.0
|
amount = 0.0
|
||||||
unit = ""
|
unit = ""
|
||||||
ingredient = ""
|
ingredient = ""
|
||||||
@ -192,12 +192,20 @@ def parse(ing_str) -> BruteParsedIngredient:
|
|||||||
# which means this is the ingredient
|
# which means this is the ingredient
|
||||||
ingredient = tokens[1]
|
ingredient = tokens[1]
|
||||||
except ValueError:
|
except ValueError:
|
||||||
try:
|
# can't parse first argument as amount
|
||||||
# can't parse first argument as amount
|
# try to parse as unit and ingredient (e.g. "a tblsp salt"), with unit in first three tokens
|
||||||
# -> no unit -> parse everything as ingredient
|
# won't work for units that have spaces
|
||||||
ingredient, note = parse_ingredient(tokens)
|
for index, token in enumerate(tokens[:3]):
|
||||||
except ValueError:
|
if parser.find_unit_match(token):
|
||||||
ingredient = " ".join(tokens[1:])
|
unit = token
|
||||||
|
ingredient, note = parse_ingredient(tokens[index + 1 :])
|
||||||
|
break
|
||||||
|
if not unit:
|
||||||
|
try:
|
||||||
|
# no unit -> parse everything as ingredient
|
||||||
|
ingredient, note = parse_ingredient(tokens)
|
||||||
|
except ValueError:
|
||||||
|
ingredient = " ".join(tokens[1:])
|
||||||
|
|
||||||
if unit_note not in note:
|
if unit_note not in note:
|
||||||
note += " " + unit_note
|
note += " " + unit_note
|
||||||
|
@ -126,22 +126,24 @@ class ABCIngredientParser(ABC):
|
|||||||
|
|
||||||
return store_map[fuzz_result[0]]
|
return store_map[fuzz_result[0]]
|
||||||
|
|
||||||
def find_food_match(self, food: IngredientFood | CreateIngredientFood) -> IngredientFood | None:
|
def find_food_match(self, food: IngredientFood | CreateIngredientFood | str) -> IngredientFood | None:
|
||||||
if isinstance(food, IngredientFood):
|
if isinstance(food, IngredientFood):
|
||||||
return food
|
return food
|
||||||
|
|
||||||
match_value = IngredientFoodModel.normalize(food.name)
|
food_name = food if isinstance(food, str) else food.name
|
||||||
|
match_value = IngredientFoodModel.normalize(food_name)
|
||||||
return self.find_match(
|
return self.find_match(
|
||||||
match_value,
|
match_value,
|
||||||
store_map=self.foods_by_alias,
|
store_map=self.foods_by_alias,
|
||||||
fuzzy_match_threshold=self.food_fuzzy_match_threshold,
|
fuzzy_match_threshold=self.food_fuzzy_match_threshold,
|
||||||
)
|
)
|
||||||
|
|
||||||
def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit) -> IngredientUnit | None:
|
def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit | str) -> IngredientUnit | None:
|
||||||
if isinstance(unit, IngredientUnit):
|
if isinstance(unit, IngredientUnit):
|
||||||
return unit
|
return unit
|
||||||
|
|
||||||
match_value = IngredientUnitModel.normalize(unit.name)
|
unit_name = unit if isinstance(unit, str) else unit.name
|
||||||
|
match_value = IngredientUnitModel.normalize(unit_name)
|
||||||
return self.find_match(
|
return self.find_match(
|
||||||
match_value,
|
match_value,
|
||||||
store_map=self.units_by_alias,
|
store_map=self.units_by_alias,
|
||||||
@ -155,6 +157,16 @@ class ABCIngredientParser(ABC):
|
|||||||
if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)):
|
if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)):
|
||||||
ingredient.ingredient.unit = unit_match
|
ingredient.ingredient.unit = unit_match
|
||||||
|
|
||||||
|
# Parser might have wrongly split a food into a unit and food.
|
||||||
|
if isinstance(ingredient.ingredient.food, CreateIngredientFood) and isinstance(
|
||||||
|
ingredient.ingredient.unit, CreateIngredientUnit
|
||||||
|
):
|
||||||
|
if food_match := self.find_food_match(
|
||||||
|
f"{ingredient.ingredient.unit.name} {ingredient.ingredient.food.name}"
|
||||||
|
):
|
||||||
|
ingredient.ingredient.food = food_match
|
||||||
|
ingredient.ingredient.unit = None
|
||||||
|
|
||||||
return ingredient
|
return ingredient
|
||||||
|
|
||||||
|
|
||||||
@ -164,7 +176,7 @@ class BruteForceParser(ABCIngredientParser):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def parse_one(self, ingredient: str) -> ParsedIngredient:
|
def parse_one(self, ingredient: str) -> ParsedIngredient:
|
||||||
bfi = brute.parse(ingredient)
|
bfi = brute.parse(ingredient, self)
|
||||||
|
|
||||||
parsed_ingredient = ParsedIngredient(
|
parsed_ingredient = ParsedIngredient(
|
||||||
input=ingredient,
|
input=ingredient,
|
||||||
|
@ -135,7 +135,7 @@ test_ingredients = [
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
|
@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
|
||||||
def test_nlp_parser():
|
def test_nlp_parser() -> None:
|
||||||
models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
|
models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
|
||||||
|
|
||||||
# Iterate over models and test_ingredients to gather
|
# Iterate over models and test_ingredients to gather
|
||||||
@ -147,37 +147,102 @@ def test_nlp_parser():
|
|||||||
assert model.unit == test_ingredient.unit
|
assert model.unit == test_ingredient.unit
|
||||||
|
|
||||||
|
|
||||||
def test_brute_parser(unique_user: TestUser):
|
@pytest.mark.parametrize(
|
||||||
# input: (quantity, unit, food, comments)
|
"input, quantity, unit, food, comment",
|
||||||
expectations = {
|
[
|
||||||
# Dutch
|
pytest.param("1 theelepel koffie", 1, "theelepel", "koffie", "", id="1 theelepel koffie"),
|
||||||
"1 theelepel koffie": (1, "theelepel", "koffie", ""),
|
pytest.param("3 theelepels koffie", 3, "theelepels", "koffie", "", id="3 theelepels koffie"),
|
||||||
"3 theelepels koffie": (3, "theelepels", "koffie", ""),
|
pytest.param("1 eetlepel tarwe", 1, "eetlepel", "tarwe", "", id="1 eetlepel tarwe"),
|
||||||
"1 eetlepel tarwe": (1, "eetlepel", "tarwe", ""),
|
pytest.param("20 eetlepels bloem", 20, "eetlepels", "bloem", "", id="20 eetlepels bloem"),
|
||||||
"20 eetlepels bloem": (20, "eetlepels", "bloem", ""),
|
pytest.param("1 mespunt kaneel", 1, "mespunt", "kaneel", "", id="1 mespunt kaneel"),
|
||||||
"1 mespunt kaneel": (1, "mespunt", "kaneel", ""),
|
pytest.param("1 snuf(je) zout", 1, "snuf(je)", "zout", "", id="1 snuf(je) zout"),
|
||||||
"1 snuf(je) zout": (1, "snuf(je)", "zout", ""),
|
pytest.param(
|
||||||
"2 tbsp minced cilantro, leaves and stems": (2, "tbsp", "minced cilantro", "leaves and stems"),
|
"2 tbsp minced cilantro, leaves and stems",
|
||||||
"1 large yellow onion, coarsely chopped": (1, "large", "yellow onion", "coarsely chopped"),
|
|
||||||
"1 1/2 tsp garam masala": (1.5, "tsp", "garam masala", ""),
|
|
||||||
"2 cups mango chunks, (2 large mangoes) (fresh or frozen)": (
|
|
||||||
2,
|
2,
|
||||||
"cups",
|
"tbsp",
|
||||||
|
"minced cilantro",
|
||||||
|
"leaves and stems",
|
||||||
|
id="2 tbsp minced cilantro, leaves and stems",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
"1 large yellow onion, coarsely chopped",
|
||||||
|
1,
|
||||||
|
"large",
|
||||||
|
"yellow onion",
|
||||||
|
"coarsely chopped",
|
||||||
|
id="1 large yellow onion, coarsely chopped",
|
||||||
|
),
|
||||||
|
pytest.param("1 1/2 tsp garam masala", 1.5, "tsp", "garam masala", "", id="1 1/2 tsp garam masala"),
|
||||||
|
pytest.param(
|
||||||
|
"2 cups mango chunks, (2 large mangoes) (fresh or frozen)",
|
||||||
|
2,
|
||||||
|
"Cups",
|
||||||
"mango chunks, (2 large mangoes)",
|
"mango chunks, (2 large mangoes)",
|
||||||
"fresh or frozen",
|
"fresh or frozen",
|
||||||
|
id="2 cups mango chunks, (2 large mangoes) (fresh or frozen)",
|
||||||
),
|
),
|
||||||
}
|
pytest.param("stalk onion", 0, "Stalk", "onion", "", id="stalk onion"),
|
||||||
|
pytest.param("a stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a stalk bell peppers"),
|
||||||
|
pytest.param("a tablespoon unknownFood", 0, "Tablespoon", "unknownFood", "", id="a tablespoon unknownFood"),
|
||||||
|
pytest.param(
|
||||||
|
"stalk bell peppers, cut in pieces",
|
||||||
|
0,
|
||||||
|
"Stalk",
|
||||||
|
"bell peppers",
|
||||||
|
"cut in pieces",
|
||||||
|
id="stalk bell peppers, cut in pieces",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
"a stalk bell peppers, cut in pieces",
|
||||||
|
0,
|
||||||
|
"Stalk",
|
||||||
|
"bell peppers",
|
||||||
|
"cut in pieces",
|
||||||
|
id="stalk bell peppers, cut in pieces",
|
||||||
|
),
|
||||||
|
pytest.param("red pepper flakes", 0, "", "red pepper flakes", "", id="red pepper flakes"),
|
||||||
|
pytest.param("1 red pepper flakes", 1, "", "red pepper flakes", "", id="1 red pepper flakes"),
|
||||||
|
pytest.param("1 bell peppers", 1, "", "bell peppers", "", id="1 bell peppers"),
|
||||||
|
pytest.param("1 stalk bell peppers", 1, "Stalk", "bell peppers", "", id="1 big stalk bell peppers"),
|
||||||
|
pytest.param("a big stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a big stalk bell peppers"),
|
||||||
|
pytest.param(
|
||||||
|
"1 bell peppers, cut in pieces", 1, "", "bell peppers", "cut in pieces", id="1 bell peppers, cut in pieces"
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
"bell peppers, cut in pieces", 0, "", "bell peppers", "cut in pieces", id="bell peppers, cut in pieces"
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_brute_parser(
|
||||||
|
unique_local_group_id: UUID4,
|
||||||
|
parsed_ingredient_data: tuple[list[IngredientFood], list[IngredientUnit]], # required so database is populated
|
||||||
|
input: str,
|
||||||
|
quantity: int | float,
|
||||||
|
unit: str,
|
||||||
|
food: str,
|
||||||
|
comment: str,
|
||||||
|
):
|
||||||
with session_context() as session:
|
with session_context() as session:
|
||||||
parser = get_parser(RegisteredParser.brute, unique_user.group_id, session)
|
parser = get_parser(RegisteredParser.brute, unique_local_group_id, session)
|
||||||
|
parsed = parser.parse_one(input)
|
||||||
|
ing = parsed.ingredient
|
||||||
|
|
||||||
for key, val in expectations.items():
|
if ing.quantity:
|
||||||
parsed = parser.parse_one(key)
|
assert ing.quantity == quantity
|
||||||
|
else:
|
||||||
assert parsed.ingredient.quantity == val[0]
|
assert not quantity
|
||||||
assert parsed.ingredient.unit.name == val[1]
|
if ing.unit:
|
||||||
assert parsed.ingredient.food.name == val[2]
|
assert ing.unit.name == unit
|
||||||
assert parsed.ingredient.note in {val[3], None}
|
else:
|
||||||
|
assert not unit
|
||||||
|
if ing.food:
|
||||||
|
assert ing.food.name == food
|
||||||
|
else:
|
||||||
|
assert not food
|
||||||
|
if ing.note:
|
||||||
|
assert ing.note == comment
|
||||||
|
else:
|
||||||
|
assert not comment
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user