feat(backend): ✨ Add NLP Endpoint for Ingredient Parser (WIP)
This commit is contained in:
parent 161618808e
commit 20d847ec8e
3 .gitignore vendored
@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
 !dev/data/backups/test*.zip
 dev/data/recipes/*
 dev/scripts/output/app_routes.py
 dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel
mealie/routes/recipe/__init__.py

@@ -1,5 +1,5 @@
 from fastapi import APIRouter
-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes

 prefix = "/recipes"

@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
 router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])
24 mealie/routes/recipe/ingredient_parser.py Normal file
@@ -0,0 +1,24 @@
from fastapi import APIRouter
from mealie.services.scraper.ingredient_nlp.processor import (
    convert_crf_models_to_ingredients,
    convert_list_to_crf_model,
)
from pydantic import BaseModel

public_router = APIRouter()


class IngredientRequest(BaseModel):
    ingredients: list[str]


@public_router.post("/parse/ingredient")
def parse_ingredients(ingredients: IngredientRequest):
    """
    Parse a list of ingredient strings.
    """

    crf_models = convert_list_to_crf_model(ingredients.ingredients)
    parsed = convert_crf_models_to_ingredients(crf_models)

    return {"ingredient": parsed}
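A minimal way to exercise the new endpoint is FastAPI's TestClient. This is a sketch only: the import path for `app` and the final route prefix are assumptions about how mealie wires its routers, not part of this commit.

from fastapi.testclient import TestClient

from mealie.app import app  # hypothetical import path for the app instance

client = TestClient(app)
resp = client.post("/api/parse/ingredient", json={"ingredients": ["2 tablespoons honey"]})
print(resp.json())  # => {"ingredient": [...]}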
0 mealie/services/scraper/ingredient_nlp/__init__.py Normal file
74 mealie/services/scraper/ingredient_nlp/processor.py Normal file
@@ -0,0 +1,74 @@
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Optional

from mealie.core.config import settings
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from pydantic import BaseModel

from . import utils

CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"

INGREDIENT_TEXT = [
    "2 tablespoons honey",
    "1/2 cup flour",
    "Black pepper, to taste",
    "2 cups of garlic finely chopped",
    "2 liters whole milk",
]


class CRFIngredient(BaseModel):
    input: Optional[str] = ""
    name: Optional[str] = ""
    other: Optional[str] = ""
    qty: Optional[str] = ""
    comment: Optional[str] = ""
    unit: Optional[str] = ""
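

# NOTE: `crf_test` is the CRF++ command-line tagger. The call below assumes the
# binary is available on PATH and that the pre-trained model file exists at
# MODEL_PATH.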
def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
    crf_output = _exec_crf_test(list_of_ingredient_text)

    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

    for model in crf_models:
        print(model)

    return crf_models


def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
    return [
        RecipeIngredient(
            title="",
            note=crf_model.comment,
            unit=CreateIngredientUnit(name=crf_model.unit),
            food=CreateIngredientFood(name=crf_model.name),
            disable_amount=settings.RECIPE_DISABLE_AMOUNT,
            quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
        )
        for crf_model in crf_models
    ]
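
# Note on the quantity math above: a qty string such as "1 1/2" is summed as
# fractions (Fraction("1") + Fraction("1/2") == Fraction(3, 2)), giving 1.5;
# an empty qty string sums to 0.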


if __name__ == "__main__":
    crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    for ingredient in ingredients:
        print(ingredient)
37 mealie/services/scraper/ingredient_nlp/tokenizer.py Normal file
@@ -0,0 +1,37 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)

def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviations like "100g" by expanding them to "100 grams"; the
    # word boundary keeps already-expanded forms like "100 grams" intact
    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)

    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces them with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
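As a quick check of the slash handling plus fraction clumping, tracing the code above on the docstring's second example gives:

tokenize("2 1/2 cups/300 grams all-purpose flour")
# => ["2$1/2", "cups", "300", "grams", "all-purpose", "flour"]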
278 mealie/services/scraper/ingredient_nlp/utils.py Normal file
@@ -0,0 +1,278 @@
import re

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

        "1\u215e" => "1 7/8"
    """

    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\u00bc": "1/4",
        "\u00be": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\u00bd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        s = s.replace(f_unicode, " " + f_ascii)

    return s
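
# For example, given the escapes above:
#     cleanUnicodeFractions("1\u00bd cups")  =>  "1 1/2 cups"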


def unclump(s):
    """
    Replaces $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def normalizeToken(s):
    """
    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
    return singularize(s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        ("I%s" % index),
        ("L%s" % lengthGroup(length)),
        ("Yes" if isCapitalized(token) else "No") + "CAP",
        ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
    ]
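
# For example:
#     getFeatures("pepper", 2, ["Black", "pepper"])
#     # => ["I2", "L4", "NoCAP", "NoPAREN"]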


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units:
        return units[word]
    else:
        return word
def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
        # => <span class='qty'>1</span><span class='name'>cat pie</span>
    """

    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    joined = " ".join(words)

    # replace " , " with ", "
    joined = joined.replace(" , ", ", ")

    # replace "( " with "("
    joined = joined.replace("( ", "(")

    # replace " )" with ")"
    joined = joined.replace(" )", ")")

    return joined
def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None
    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black     I1  L8   YesCAP X  B-NAME/0.765461
    #   pepper    I2  L8   NoCAP  X  I-NAME/0.756614
    #   ,         I3  L8   NoCAP  X  OTHER/0.798040
    #   to        I4  L8   NoCAP  X  B-COMMENT/0.683089
    #   taste     I5  L8   NoCAP  X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag
            else:
                display[-1][-1][1].append(token)
                #               ^- token
                #           ^---- tag
                #       ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]

    # Add the marked-up display data
    for i, v in enumerate(output):
        output[i]["display"] = displayIngredient(display[i])

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])

    return output
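
# Fed the five "Black pepper , to taste" rows from the sample output above,
# import_data returns a single dict shaped like:
#     {"name": "Black pepper", "other": ",", "comment": "to taste",
#      "display": "<span class='name'>Black pepper</span>...",
#      "input": "Black pepper, to taste"}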


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output"""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append("")
    return "\n".join(output)
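For a single phrase, the CRF-ready block produced by export_data looks like this (traced from tokenize and getFeatures above, one tab-joined line per token followed by a blank line):

export_data(["Black pepper, to taste"])
# => "Black\tI1\tL8\tYesCAP\tNoPAREN
#     pepper\tI2\tL8\tNoCAP\tNoPAREN
#     ,\tI3\tL8\tNoCAP\tNoPAREN
#     to\tI4\tL8\tNoCAP\tNoPAREN
#     taste\tI5\tL8\tNoCAP\tNoPAREN
#     "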