update NLP for ingredients

2025-07-09 03:04:54 -04:00 · 2021-08-29 17:10:51 -08:00 · 2021-08-29 17:10:51 -08:00 · 2e6352cfbd
commit 2e6352cfbd
parent 086098899d
7 changed files with 164 additions and 22 deletions
--- a/mealie/services/backups/imports.py
+++ b/mealie/services/backups/imports.py
@ -22,7 +22,7 @@ from mealie.schema.admin import (
 )
 from mealie.schema.events import EventNotificationIn
 from mealie.schema.recipe import CommentOut, Recipe
-from mealie.schema.user import UpdateGroup, PrivateUser
+from mealie.schema.user import PrivateUser, UpdateGroup
 from mealie.services.image import minify
--- a/mealie/services/scraper/ingredient_nlp/pre_processor.py
+++ b/mealie/services/scraper/ingredient_nlp/pre_processor.py
@ -0,0 +1,97 @@
 import re
 import unicodedata
 # Maps an abbreviated unit (with its trailing space) to the spelled-out unit.
 # The trailing space in each key means an abbreviation is only recognised when
 # it is followed by a space.
 replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
 }
 # One alternation over every abbreviation, longest key first. The look-behind
 # rejects a match that starts right after a letter, so "g " is not found inside
 # "egg " or "kg ", while "500g " (preceded by a digit) is still expanded.
 _abbreviation_pattern = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(key) for key in sorted(replace_abbreviations, key=len, reverse=True))
    + ")"
 )
 def replace_common_abbreviations(string: str) -> str:
    """Expand the unit abbreviations listed in ``replace_abbreviations``.

    The whole string is rewritten in a single pass, so the output of one
    expansion is never re-scanned by another, and an abbreviation is only
    expanded when it does not directly follow a letter.

    Args:
        string: Ingredient text; matching is case-sensitive against the
            lowercase keys of ``replace_abbreviations``.

    Returns:
        The text with every recognised abbreviation spelled out.
    """
    return _abbreviation_pattern.sub(lambda match: replace_abbreviations[match.group(0)], string)
 def remove_periods(string: str) -> str:
    """Remove every period that has no digit directly before or after it.

    A period touching a digit on either side (as in "1.5", "2." or ".5") is
    left in place; every other period is deleted.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)
 def replace_fraction_unicode(string: str):
    """Replace every unicode vulgar fraction with an ASCII ``n/d`` fraction.

    Each fraction character becomes a space followed by ``numerator/denominator``
    (so "1½" turns into "1 1/2"), after which double spaces are collapsed to a
    single space. Strings without a vulgar fraction are returned unchanged.

    Args:
        string: Text that may contain characters such as "½" or "¼".

    Returns:
        The text with all vulgar fraction characters spelled out in ASCII.
    """
    # TODO: Needs broader test coverage before being relied on in production.
    # Iterate over a de-duplicated snapshot of the characters so `string` can be
    # rebound inside the loop; every distinct fraction character is handled
    # instead of stopping after the first one.
    for c in dict.fromkeys(string):
        # name() with a default avoids the ValueError raised for unnamed characters.
        if not unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            continue
        normalized = unicodedata.normalize("NFKC", c)
        # NFKC spells the fraction as digits around U+2044 FRACTION SLASH.
        numerator, _slash, denominator = normalized.partition("\u2044")
        string = string.replace(c, f" {numerator}/{denominator}").replace("  ", " ")
    return string
 def wrap_or_clause(string: str):
    """
    Attempts to wrap the first " or " clause in parentheses.

    The clause runs from the first " or " up to the next comma (or the end of
    the string). Everything after that comma is kept verbatim, including any
    further commas. A string without " or " is returned unchanged.

    Examples:
    '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
    'butter or oil, melted, cooled' -> 'butter (or oil), melted, cooled'
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    head, separator, remainder = string.partition(" or ")
    if not separator:
        # No " or " clause present; nothing to wrap.
        return string
    alternative, _comma, trailing = remainder.partition(",")
    # The comma is always re-inserted and then dropped again when nothing follows it.
    return f"{head} (or {alternative}),{trailing}".strip().removesuffix(",")
 def pre_process_string(string: str) -> str:
    """
    Normalise raw ingredient text before it is handed to the CRF++ model.

    The text is lowercased, unicode fractions are spelled out, stray periods are
    removed and unit abbreviations are expanded, in that order. If the result
    contains " or ", the alternative clause is wrapped in parentheses. The target
    shape is roughly:

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more
    """
    cleaned = string.lower()
    # Order matters: fractions are expanded before periods are stripped, and
    # periods are stripped before abbreviations such as "tsp." are matched.
    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)
    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
 def main():
    """Print the pre-processed form of a few sample ingredient lines."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )
    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")
 if __name__ == "__main__":
    main()
--- a/mealie/services/scraper/ingredient_nlp/processor.py
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@ -1,17 +1,17 @@
 import subprocess
 import tempfile
 import unicodedata
 from fractions import Fraction
 from pathlib import Path
 from typing import Optional
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 from mealie.core.config import settings
 from mealie.schema.recipe import RecipeIngredient
 from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
 from . import utils
 from .pre_processor import pre_process_string
 CWD = Path(__file__).parent
 MODEL_PATH = CWD / "model.crfmodel"
@ -33,6 +33,17 @@ class CRFIngredient(BaseModel):
    comment: Optional[str] = ""
    unit: Optional[str] = ""
    @validator("qty", always=True, pre=True)
    def validate_qty(qty, values):
        """Supply a quantity when the parsed ``qty`` is missing or empty.

        A non-empty ``qty`` is returned untouched. Otherwise, if the ``other``
        field holds a fraction such as "1/2", that fraction is converted to a
        float rounded to one decimal place; in every other case the quantity
        defaults to 1.
        """
        if qty is not None and qty != "":
            return qty
        # .get(): `other` is absent from `values` when it has not been validated
        # (or failed validation) by the time this validator runs.
        other = values.get("other")
        if other is not None and "/" in other:
            try:
                return round(float(Fraction(other)), 1)
            except (ValueError, ZeroDivisionError):
                # A slash alone does not make `other` a fraction (e.g. "and/or",
                # "1/0"); fall back to the default rather than failing the model.
                return 1
        return 1
 def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
@ -43,24 +54,8 @@ def _exec_crf_test(input_text):
        )
 def fraction_finder(string: str):
    """Replace every unicode vulgar fraction in ``string`` with ASCII ``n/d``.

    All fraction characters in the string are converted, not just the first
    kind encountered. Every other character is copied through unchanged.

    Args:
        string: Text that may contain characters such as "½" or "¼".

    Returns:
        The text with each vulgar fraction character spelled as ``n/d``.
    """
    # TODO: Needs broader test coverage before being relied on in production.
    pieces = []
    for c in string:
        # name() with a default avoids the ValueError raised for unnamed characters.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            normalized = unicodedata.normalize("NFKC", c)
            # NFKC spells the fraction as digits around U+2044 FRACTION SLASH.
            numerator, _slash, denominator = normalized.partition("\u2044")
            pieces.append(f"{numerator}/{denominator}")
        else:
            pieces.append(c)
    return "".join(pieces)
 def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
-    crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text])
+    crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
@ -89,4 +84,4 @@ if __name__ == "__main__":
    ingredients = convert_crf_models_to_ingredients(crf_models)
    for ingredient in ingredients:
-        print(ingredient)
+        print(ingredient.input)
--- a/mealie/services/scraper/ingredient_nlp/tokenizer.py
+++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py
@ -28,6 +28,7 @@ def tokenize(s):
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces it with a space.
    for unit in american_units:
--- a/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
+++ b/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
--- a/mealie/services/scraper/ingredient_nlp/utils.py
+++ b/mealie/services/scraper/ingredient_nlp/utils.py
@ -47,7 +47,7 @@ def unclump(s):
 def normalizeToken(s):
    """
-    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+    TODO: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
@ -222,6 +222,20 @@ def import_data(lines):
            tag, confidence = re.split(r"/", columns[-1], 1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()
            # TODO: Integrate Confidence into API Response
            print("Confidence", confidence)
            # new token
            if prevTag != tag or token == "n/a":
                display[-1].append((tag, [token]))
                data[-1][tag] = []
                prevTag = tag
            # continuation
            else:
                display[-1][-1][1].append(token)
                data[-1][tag].append(token)
            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.
--- a/tests/unit_tests/test_nlp_parser.py
+++ b/tests/unit_tests/test_nlp_parser.py
@ -0,0 +1,35 @@
 from dataclasses import dataclass
 from fractions import Fraction
 from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model
@dataclass
class TestIngredient:
    """An ingredient line paired with the quantity the parser should extract."""

    # The class name starts with "Test", so pytest would try to collect it and
    # warn about its __init__; this opts the class out of collection. It has no
    # annotation, so the dataclass does not treat it as a field.
    __test__ = False

    input: str  # raw ingredient text fed to the parser
    quantity: float  # expected numeric quantity
 # Ingredient lines and the quantity each one is expected to parse to.
 test_ingredients = [
    TestIngredient(input=text, quantity=expected)
    for text, expected in (
        ("½ cup all-purpose flour", 0.5),
        ("1 ½ teaspoons ground black pepper", 1.5),
        ("⅔ cup unsweetened flaked coconut", 0.7),
        ("⅓ cup panko bread crumbs", 0.3),
    )
 ]
 def test_nlp_parser():
    """Check that the parser extracts the expected quantity for each sample line."""
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
    # zip() stops at the shorter sequence, so without this check a parser that
    # returns too few models would let the test pass without comparing anything.
    assert len(models) == len(test_ingredients)
    # Iterate over models and test_ingredients together
    print()
    for model, test_ingredient in zip(models, test_ingredients):
        print("Testing:", test_ingredient.input, end="")
        # qty may hold several space-separated parts (e.g. "1 1/2"); sum them.
        assert float(sum(Fraction(s) for s in model.qty.split())) == test_ingredient.quantity
        print(" ✅ Passed")
 if __name__ == "__main__":
    test_nlp_parser()