Better bruteforce parsing for units (#3066)

* try to match units when brute parsing and no amount is matched

* brute parser: better handle multi-word food items

Also checks the case where a food might have been wrongly split into a unit + ingredient

* fix formatting

* add test cases for parsing ingredients that don't start with an amount

* parametrized tests and added ingredient data fixture

* fixed group_id ref in tests

* fixed test inputs

* add extra tests for units as third token

---------

Co-authored-by: Michael Genson <71845777+michael-genson@users.noreply.github.com>
This commit is contained in:
RealFoxie 2024-02-07 16:16:20 +01:00 committed by GitHub
parent 597e6c8e0f
commit e686fa671c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 123 additions and 38 deletions

View File

@ -132,7 +132,7 @@ def parse_ingredient(tokens) -> tuple[str, str]:
return ingredient, note return ingredient, note
def parse(ing_str) -> BruteParsedIngredient: def parse(ing_str, parser) -> BruteParsedIngredient:
amount = 0.0 amount = 0.0
unit = "" unit = ""
ingredient = "" ingredient = ""
@ -192,12 +192,20 @@ def parse(ing_str) -> BruteParsedIngredient:
# which means this is the ingredient # which means this is the ingredient
ingredient = tokens[1] ingredient = tokens[1]
except ValueError: except ValueError:
try: # can't parse first argument as amount
# can't parse first argument as amount # try to parse as unit and ingredient (e.g. "a tblsp salt"), with unit in first three tokens
# -> no unit -> parse everything as ingredient # won't work for units that have spaces
ingredient, note = parse_ingredient(tokens) for index, token in enumerate(tokens[:3]):
except ValueError: if parser.find_unit_match(token):
ingredient = " ".join(tokens[1:]) unit = token
ingredient, note = parse_ingredient(tokens[index + 1 :])
break
if not unit:
try:
# no unit -> parse everything as ingredient
ingredient, note = parse_ingredient(tokens)
except ValueError:
ingredient = " ".join(tokens[1:])
if unit_note not in note: if unit_note not in note:
note += " " + unit_note note += " " + unit_note

View File

@ -126,22 +126,24 @@ class ABCIngredientParser(ABC):
return store_map[fuzz_result[0]] return store_map[fuzz_result[0]]
def find_food_match(self, food: IngredientFood | CreateIngredientFood) -> IngredientFood | None: def find_food_match(self, food: IngredientFood | CreateIngredientFood | str) -> IngredientFood | None:
if isinstance(food, IngredientFood): if isinstance(food, IngredientFood):
return food return food
match_value = IngredientFoodModel.normalize(food.name) food_name = food if isinstance(food, str) else food.name
match_value = IngredientFoodModel.normalize(food_name)
return self.find_match( return self.find_match(
match_value, match_value,
store_map=self.foods_by_alias, store_map=self.foods_by_alias,
fuzzy_match_threshold=self.food_fuzzy_match_threshold, fuzzy_match_threshold=self.food_fuzzy_match_threshold,
) )
def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit) -> IngredientUnit | None: def find_unit_match(self, unit: IngredientUnit | CreateIngredientUnit | str) -> IngredientUnit | None:
if isinstance(unit, IngredientUnit): if isinstance(unit, IngredientUnit):
return unit return unit
match_value = IngredientUnitModel.normalize(unit.name) unit_name = unit if isinstance(unit, str) else unit.name
match_value = IngredientUnitModel.normalize(unit_name)
return self.find_match( return self.find_match(
match_value, match_value,
store_map=self.units_by_alias, store_map=self.units_by_alias,
@ -155,6 +157,16 @@ class ABCIngredientParser(ABC):
if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)): if ingredient.ingredient.unit and (unit_match := self.find_unit_match(ingredient.ingredient.unit)):
ingredient.ingredient.unit = unit_match ingredient.ingredient.unit = unit_match
# Parser might have wrongly split a food into a unit and food.
if isinstance(ingredient.ingredient.food, CreateIngredientFood) and isinstance(
ingredient.ingredient.unit, CreateIngredientUnit
):
if food_match := self.find_food_match(
f"{ingredient.ingredient.unit.name} {ingredient.ingredient.food.name}"
):
ingredient.ingredient.food = food_match
ingredient.ingredient.unit = None
return ingredient return ingredient
@ -164,7 +176,7 @@ class BruteForceParser(ABCIngredientParser):
""" """
def parse_one(self, ingredient: str) -> ParsedIngredient: def parse_one(self, ingredient: str) -> ParsedIngredient:
bfi = brute.parse(ingredient) bfi = brute.parse(ingredient, self)
parsed_ingredient = ParsedIngredient( parsed_ingredient = ParsedIngredient(
input=ingredient, input=ingredient,

View File

@ -135,7 +135,7 @@ test_ingredients = [
@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed") @pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
def test_nlp_parser(): def test_nlp_parser() -> None:
models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients]) models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
# Iterate over models and test_ingredients to gather # Iterate over models and test_ingredients to gather
@ -147,37 +147,102 @@ def test_nlp_parser():
assert model.unit == test_ingredient.unit assert model.unit == test_ingredient.unit
def test_brute_parser(unique_user: TestUser): @pytest.mark.parametrize(
# input: (quantity, unit, food, comments) "input, quantity, unit, food, comment",
expectations = { [
# Dutch pytest.param("1 theelepel koffie", 1, "theelepel", "koffie", "", id="1 theelepel koffie"),
"1 theelepel koffie": (1, "theelepel", "koffie", ""), pytest.param("3 theelepels koffie", 3, "theelepels", "koffie", "", id="3 theelepels koffie"),
"3 theelepels koffie": (3, "theelepels", "koffie", ""), pytest.param("1 eetlepel tarwe", 1, "eetlepel", "tarwe", "", id="1 eetlepel tarwe"),
"1 eetlepel tarwe": (1, "eetlepel", "tarwe", ""), pytest.param("20 eetlepels bloem", 20, "eetlepels", "bloem", "", id="20 eetlepels bloem"),
"20 eetlepels bloem": (20, "eetlepels", "bloem", ""), pytest.param("1 mespunt kaneel", 1, "mespunt", "kaneel", "", id="1 mespunt kaneel"),
"1 mespunt kaneel": (1, "mespunt", "kaneel", ""), pytest.param("1 snuf(je) zout", 1, "snuf(je)", "zout", "", id="1 snuf(je) zout"),
"1 snuf(je) zout": (1, "snuf(je)", "zout", ""), pytest.param(
"2 tbsp minced cilantro, leaves and stems": (2, "tbsp", "minced cilantro", "leaves and stems"), "2 tbsp minced cilantro, leaves and stems",
"1 large yellow onion, coarsely chopped": (1, "large", "yellow onion", "coarsely chopped"),
"1 1/2 tsp garam masala": (1.5, "tsp", "garam masala", ""),
"2 cups mango chunks, (2 large mangoes) (fresh or frozen)": (
2, 2,
"cups", "tbsp",
"minced cilantro",
"leaves and stems",
id="2 tbsp minced cilantro, leaves and stems",
),
pytest.param(
"1 large yellow onion, coarsely chopped",
1,
"large",
"yellow onion",
"coarsely chopped",
id="1 large yellow onion, coarsely chopped",
),
pytest.param("1 1/2 tsp garam masala", 1.5, "tsp", "garam masala", "", id="1 1/2 tsp garam masala"),
pytest.param(
"2 cups mango chunks, (2 large mangoes) (fresh or frozen)",
2,
"Cups",
"mango chunks, (2 large mangoes)", "mango chunks, (2 large mangoes)",
"fresh or frozen", "fresh or frozen",
id="2 cups mango chunks, (2 large mangoes) (fresh or frozen)",
), ),
} pytest.param("stalk onion", 0, "Stalk", "onion", "", id="stalk onion"),
pytest.param("a stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a stalk bell peppers"),
pytest.param("a tablespoon unknownFood", 0, "Tablespoon", "unknownFood", "", id="a tablespoon unknownFood"),
pytest.param(
"stalk bell peppers, cut in pieces",
0,
"Stalk",
"bell peppers",
"cut in pieces",
id="stalk bell peppers, cut in pieces",
),
pytest.param(
"a stalk bell peppers, cut in pieces",
0,
"Stalk",
"bell peppers",
"cut in pieces",
id="stalk bell peppers, cut in pieces",
),
pytest.param("red pepper flakes", 0, "", "red pepper flakes", "", id="red pepper flakes"),
pytest.param("1 red pepper flakes", 1, "", "red pepper flakes", "", id="1 red pepper flakes"),
pytest.param("1 bell peppers", 1, "", "bell peppers", "", id="1 bell peppers"),
pytest.param("1 stalk bell peppers", 1, "Stalk", "bell peppers", "", id="1 big stalk bell peppers"),
pytest.param("a big stalk bell peppers", 0, "Stalk", "bell peppers", "", id="a big stalk bell peppers"),
pytest.param(
"1 bell peppers, cut in pieces", 1, "", "bell peppers", "cut in pieces", id="1 bell peppers, cut in pieces"
),
pytest.param(
"bell peppers, cut in pieces", 0, "", "bell peppers", "cut in pieces", id="bell peppers, cut in pieces"
),
],
)
def test_brute_parser(
unique_local_group_id: UUID4,
parsed_ingredient_data: tuple[list[IngredientFood], list[IngredientUnit]], # required so database is populated
input: str,
quantity: int | float,
unit: str,
food: str,
comment: str,
):
with session_context() as session: with session_context() as session:
parser = get_parser(RegisteredParser.brute, unique_user.group_id, session) parser = get_parser(RegisteredParser.brute, unique_local_group_id, session)
parsed = parser.parse_one(input)
ing = parsed.ingredient
for key, val in expectations.items(): if ing.quantity:
parsed = parser.parse_one(key) assert ing.quantity == quantity
else:
assert parsed.ingredient.quantity == val[0] assert not quantity
assert parsed.ingredient.unit.name == val[1] if ing.unit:
assert parsed.ingredient.food.name == val[2] assert ing.unit.name == unit
assert parsed.ingredient.note in {val[3], None} else:
assert not unit
if ing.food:
assert ing.food.name == food
else:
assert not food
if ing.note:
assert ing.note == comment
else:
assert not comment
@pytest.mark.parametrize( @pytest.mark.parametrize(