From 2e6352cfbd01cac0d264680ee5d6c23887f2ddcc Mon Sep 17 00:00:00 2001 From: hay-kot Date: Sun, 29 Aug 2021 17:10:51 -0800 Subject: [PATCH] update NLP for ingredients --- mealie/services/backups/imports.py | 2 +- .../scraper/ingredient_nlp/pre_processor.py | 97 +++++++++++++++++++ .../scraper/ingredient_nlp/processor.py | 35 +++---- .../scraper/ingredient_nlp/tokenizer.py | 1 + .../ingredient_nlp/unicode_fraction_dict.py | 0 .../services/scraper/ingredient_nlp/utils.py | 16 ++- tests/unit_tests/test_nlp_parser.py | 35 +++++++ 7 files changed, 164 insertions(+), 22 deletions(-) create mode 100644 mealie/services/scraper/ingredient_nlp/pre_processor.py delete mode 100644 mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py create mode 100644 tests/unit_tests/test_nlp_parser.py diff --git a/mealie/services/backups/imports.py b/mealie/services/backups/imports.py index 742705eabe6b..d46a1a475088 100644 --- a/mealie/services/backups/imports.py +++ b/mealie/services/backups/imports.py @@ -22,7 +22,7 @@ from mealie.schema.admin import ( ) from mealie.schema.events import EventNotificationIn from mealie.schema.recipe import CommentOut, Recipe -from mealie.schema.user import UpdateGroup, PrivateUser +from mealie.schema.user import PrivateUser, UpdateGroup from mealie.services.image import minify diff --git a/mealie/services/scraper/ingredient_nlp/pre_processor.py b/mealie/services/scraper/ingredient_nlp/pre_processor.py new file mode 100644 index 000000000000..a6a5d4726c47 --- /dev/null +++ b/mealie/services/scraper/ingredient_nlp/pre_processor.py @@ -0,0 +1,97 @@ +import re +import unicodedata + +replace_abbreviations = { + "cup ": "cup ", + "g ": "gram ", + "kg ": "kilogram ", + "lb ": "pound ", + "ml ": "milliliter ", + "oz ": "ounce ", + "pint ": "pint ", + "qt ": "quart ", + "tbs ": "tablespoon ", + "tbsp ": "tablespoon ", + "tsp ": "teaspoon ", +} + + +def replace_common_abbreviations(string: str) -> str: + for k, v in replace_abbreviations.items(): + string = string.replace(k, v) + + return string + + +def remove_periods(string: str) -> str: + """Removes periods not sournded by digets""" + return re.sub(r"(? '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more' + + """ + # TODO: Needs more adequite testing to be sure this doens't have side effects. + split_by_or = string.split(" or ") + + split_by_comma = split_by_or[1].split(",") + + if len(split_by_comma) > 0: + return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",") + + return string + + +def pre_process_string(string: str) -> str: + """ + Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like... + + {qty} {unit} {food}, {additional} + 1 tbs. wine, expensive or other white wine, plus more + + """ + string = string.lower() + string = replace_fraction_unicode(string) + string = remove_periods(string) + string = replace_common_abbreviations(string) + + if " or " in string: + string = wrap_or_clause(string) + + return string + + +def main(): + # TODO: Migrate to unittests + print("Starting...") + print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more")) + print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt")) + print(pre_process_string("¼ cup michiu tou or other rice wine")) + print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more")) + print("Finished...") + + +if __name__ == "__main__": + main() diff --git a/mealie/services/scraper/ingredient_nlp/processor.py b/mealie/services/scraper/ingredient_nlp/processor.py index 4dafe2c65687..7e18709cf78a 100644 --- a/mealie/services/scraper/ingredient_nlp/processor.py +++ b/mealie/services/scraper/ingredient_nlp/processor.py @@ -1,17 +1,17 @@ import subprocess import tempfile -import unicodedata from fractions import Fraction from pathlib import Path from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel, validator from mealie.core.config import settings from mealie.schema.recipe import RecipeIngredient from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit from . import utils +from .pre_processor import pre_process_string CWD = Path(__file__).parent MODEL_PATH = CWD / "model.crfmodel" @@ -33,6 +33,17 @@ class CRFIngredient(BaseModel): comment: Optional[str] = "" unit: Optional[str] = "" + @validator("qty", always=True, pre=True) + def validate_qty(qty, values): # sourcery skip: merge-nested-ifs + if qty is None or qty == "": + # Check if other contains a fraction + if values["other"] is not None and values["other"].find("/") != -1: + return float(Fraction(values["other"])).__round__(1) + else: + return 1 + + return qty + def _exec_crf_test(input_text): with tempfile.NamedTemporaryFile(mode="w") as input_file: @@ -43,24 +54,8 @@ def _exec_crf_test(input_text): ) -def fraction_finder(string: str): - # TODO: I'm not confident this works well enough for production needs some testing and/or refacorting - for c in string: - try: - name = unicodedata.name(c) - except ValueError: - continue - if name.startswith("VULGAR FRACTION"): - normalized = unicodedata.normalize("NFKC", c) - numerator, _slash, denominator = normalized.partition("⁄") - text = f"{numerator}/{denominator}" - return string.replace(c, text) - - return string - - def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]): - crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text]) + crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text]) crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))] @@ -89,4 +84,4 @@ if __name__ == "__main__": ingredients = convert_crf_models_to_ingredients(crf_models) for ingredient in ingredients: - print(ingredient) + print(ingredient.input) diff --git a/mealie/services/scraper/ingredient_nlp/tokenizer.py b/mealie/services/scraper/ingredient_nlp/tokenizer.py index 4973d388a692..d899bfb45dcb 100644 --- a/mealie/services/scraper/ingredient_nlp/tokenizer.py +++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py @@ -28,6 +28,7 @@ def tokenize(s): s = re.sub(r"(\d+)oz", r"\1 ounces", s) s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE) + # TODO: Replace american_units with list of units from database? american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"] # The following removes slashes following American units and replaces it with a space. for unit in american_units: diff --git a/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py b/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/mealie/services/scraper/ingredient_nlp/utils.py b/mealie/services/scraper/ingredient_nlp/utils.py index f573ea93a8c7..4b49d5908abc 100644 --- a/mealie/services/scraper/ingredient_nlp/utils.py +++ b/mealie/services/scraper/ingredient_nlp/utils.py @@ -47,7 +47,7 @@ def unclump(s): def normalizeToken(s): """ - ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but + TODO: FIX THIS. We used to use the pattern.en package to singularize words, but in the name of simple deployments, we took it out. We should fix this at some point. """ @@ -222,6 +222,20 @@ def import_data(lines): tag, confidence = re.split(r"/", columns[-1], 1) tag = re.sub(r"^[BI]\-", "", tag).lower() + # TODO: Integrate Confidence into API Response + print("Confidence", confidence) + + # new token + if prevTag != tag or token == "n/a": + display[-1].append((tag, [token])) + data[-1][tag] = [] + prevTag = tag + + # continuation + else: + display[-1][-1][1].append(token) + data[-1][tag].append(token) + # ---- DISPLAY ---- # build a structure which groups each token by its tag, so we can # rebuild the original display name later. diff --git a/tests/unit_tests/test_nlp_parser.py b/tests/unit_tests/test_nlp_parser.py new file mode 100644 index 000000000000..c3e6ecb12baa --- /dev/null +++ b/tests/unit_tests/test_nlp_parser.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass +from fractions import Fraction + +from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model + + +@dataclass +class TestIngredient: + input: str + quantity: float + + +test_ingredients = [ + TestIngredient("½ cup all-purpose flour", 0.5), + TestIngredient("1 ½ teaspoons ground black pepper", 1.5), + TestIngredient("⅔ cup unsweetened flaked coconut", 0.7), + TestIngredient("⅓ cup panko bread crumbs", 0.3), +] + + +def test_nlp_parser(): + models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients]) + + # Itterate over mdoels and test_ingreidnets to gether + print() + for model, test_ingredient in zip(models, test_ingredients): + print("Testing:", test_ingredient.input, end="") + + assert float(sum(Fraction(s) for s in model.qty.split())) == test_ingredient.quantity + + print(" ✅ Passed") + + +if __name__ == "__main__": + test_nlp_parser()