mirror of https://github.com/mealie-recipes/mealie.git
synced 2025-07-09 03:04:54 -04:00

feat(backend): ✨ Add NLP Endpoint for Ingredient Parser (WIP)

This commit is contained in:
parent 161618808e
commit 20d847ec8e

.gitignore (vendored)

@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
 !dev/data/backups/test*.zip
 dev/data/recipes/*
 dev/scripts/output/app_routes.py
 dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel

@@ -1,5 +1,5 @@
 from fastapi import APIRouter
-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes

 prefix = "/recipes"

@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
 router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])

mealie/routes/recipe/ingredient_parser.py (new file, 24 lines)

@@ -0,0 +1,24 @@
from fastapi import APIRouter
from mealie.services.scraper.ingredient_nlp.processor import (
    convert_crf_models_to_ingredients,
    convert_list_to_crf_model,
)
from pydantic import BaseModel

public_router = APIRouter()


class IngredientRequest(BaseModel):
    ingredients: list[str]


@public_router.post("/parse/ingredient")
def parse_ingredients(ingredients: IngredientRequest):
    """
    Parse a list of ingredient strings.
    """

    crf_models = convert_list_to_crf_model(ingredients.ingredients)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    return {"ingredient": ingredients}
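
As wired above, the include for ingredient_parser.public_router omits the prefix=prefix argument, so the endpoint is registered at /parse/ingredient rather than /recipes/parse/ingredient. A minimal sketch of calling it with FastAPI's TestClient, assuming the modules from this commit are importable and the CRF++ crf_test binary plus model.crfmodel are present:

    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    from mealie.routes.recipe import ingredient_parser

    # Mount the new router exactly as the diff wires it (no "/recipes" prefix).
    app = FastAPI()
    app.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])

    client = TestClient(app)
    response = client.post("/parse/ingredient", json={"ingredients": ["2 tablespoons honey"]})
    print(response.json())  # => {"ingredient": [...]}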

mealie/services/scraper/ingredient_nlp/__init__.py (new file, empty)

mealie/services/scraper/ingredient_nlp/processor.py (new file, 74 lines)

@@ -0,0 +1,74 @@
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Optional

from mealie.core.config import settings
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from pydantic import BaseModel

from . import utils

CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"

INGREDIENT_TEXT = [
    "2 tablespoons honey",
    "1/2 cup flour",
    "Black pepper, to taste",
    "2 cups of garlic finely chopped",
    "2 liters whole milk",
]


class CRFIngredient(BaseModel):
    input: Optional[str] = ""
    name: Optional[str] = ""
    other: Optional[str] = ""
    qty: Optional[str] = ""
    comment: Optional[str] = ""
    unit: Optional[str] = ""


def _exec_crf_test(input_text):
    # Requires the CRF++ "crf_test" binary on PATH and the trained model file.
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
    crf_output = _exec_crf_test(list_of_ingredient_text)

    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

    for model in crf_models:
        print(model)  # WIP: debug output

    return crf_models


def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
    return [
        RecipeIngredient(
            title="",
            note=crf_model.comment,
            unit=CreateIngredientUnit(name=crf_model.unit),
            food=CreateIngredientFood(name=crf_model.name),
            disable_amount=settings.RECIPE_DISABLE_AMOUNT,
            quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
        )
        for crf_model in crf_models
    ]


if __name__ == "__main__":
    crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    for ingredient in ingredients:
        print(ingredient)
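
The quantity conversion above turns the tagger's qty strings, which can be mixed numbers like "1 2/3" once unclumped, into floats by summing each whitespace-separated part as a Fraction. A small standalone illustration of that arithmetic:

    from fractions import Fraction

    # "1 2/3" parses as Fraction(1) + Fraction(2, 3) = 5/3
    print(float(sum(Fraction(part) for part in "1 2/3".split())))  # 1.6666666666666667
    print(float(sum(Fraction(part) for part in "1/2".split())))    # 0.5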

mealie/services/scraper/ingredient_nlp/tokenizer.py (new file, 37 lines)

@@ -0,0 +1,37 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)


def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviations like "100g" by treating them as "100 grams"
    s = re.sub(r"(\d+)g", r"\1 grams", s)
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces them with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
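
A quick demonstration of the tokenizer on a dual-unit line, assuming the module path from this commit; the "cups/" replacement and clumpFractions combine so the mixed-number quantity survives as a single token:

    from mealie.services.scraper.ingredient_nlp.tokenizer import clumpFractions, tokenize

    print(clumpFractions("aaa 1 2/3 bbb"))
    # aaa 1$2/3 bbb

    print(tokenize("2 1/2 cups/300 grams all-purpose flour"))
    # ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']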

mealie/services/scraper/ingredient_nlp/utils.py (new file, 278 lines)

@@ -0,0 +1,278 @@
import re

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

        "1\u215e" => "1 7/8"
    """

    # NB: these must be \u escapes; Python's \x takes exactly two hex digits.
    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\xbc": "1/4",
        "\xbe": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\xbd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        s = s.replace(f_unicode, " " + f_ascii)

    return s
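
A quick check of the fraction replacement, assuming the module path from this commit. The leading space matters: it turns "1⅞" into "1 7/8", which the tokenizer's clumpFractions can then glue back into a single quantity token:

    from mealie.services.scraper.ingredient_nlp.utils import cleanUnicodeFractions

    print(cleanUnicodeFractions("1\u215e cups milk"))
    # 1 7/8 cups milk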

def unclump(s):
    """
    Replaces $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def normalizeToken(s):
    """
    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
    return singularize(s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        ("I%s" % index),
        ("L%s" % lengthGroup(length)),
        ("Yes" if isCapitalized(token) else "No") + "CAP",
        ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
    ]


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units:
        return units[word]
    else:
        return word


def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
        # => <span class='qty'>1</span> <span class='name'>cat pie</span>
    """

    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    input = " ".join(words)

    # replace " , " with ", "
    input = input.replace(" , ", ", ")

    # replace "( " with "("
    input = input.replace("( ", "(")

    # replace " )" with ")"
    input = input.replace(" )", ")")

    return input


def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None

    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black     I1  L8  YesCAP  X  B-NAME/0.765461
    #   pepper    I2  L8  NoCAP   X  I-NAME/0.756614
    #   ,         I3  L8  NoCAP   X  OTHER/0.798040
    #   to        I4  L8  NoCAP   X  B-COMMENT/0.683089
    #   taste     I5  L8  NoCAP   X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag
            else:
                display[-1][-1][1].append(token)
                #                  ^- token
                #            ^---- tag
                #        ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]

    # Add the marked-up display data
    for i, v in enumerate(output):
        output[i]["display"] = displayIngredient(display[i])

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])

    return output


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output."""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append("")
    return "\n".join(output)
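
export_data is what feeds crf_test: each token becomes a tab-separated line of the token plus its getFeatures output, with a blank line terminating each ingredient. A sketch of the expected shape, assuming this module is importable (tabs shown as spaces):

    from mealie.services.scraper.ingredient_nlp import utils

    print(utils.export_data(["1/2 cup flour"]))
    # 1/2    I1  L4  NoCAP  NoPAREN
    # cup    I2  L4  NoCAP  NoPAREN
    # flour  I3  L4  NoCAP  NoPAREN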