mirror of https://github.com/mealie-recipes/mealie.git
synced 2025-07-09 03:04:54 -04:00

feat(backend): ✨ Add NLP Endpoint for Ingredient Parser (WIP)

This commit is contained in:
parent 161618808e
commit 20d847ec8e

.gitignore (vendored)

@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
 !dev/data/backups/test*.zip
 dev/data/recipes/*
 dev/scripts/output/app_routes.py
 dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel

@@ -1,5 +1,5 @@
 from fastapi import APIRouter
-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes

 prefix = "/recipes"

@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
 router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])

mealie/routes/recipe/ingredient_parser.py (new file, 24 lines)

@@ -0,0 +1,24 @@
from fastapi import APIRouter
from mealie.services.scraper.ingredient_nlp.processor import (
    convert_crf_models_to_ingredients,
    convert_list_to_crf_model,
)
from pydantic import BaseModel

public_router = APIRouter()


class IngredientRequest(BaseModel):
    ingredients: list[str]


@public_router.post("/parse/ingredient")
def parse_ingredients(ingredients: IngredientRequest):
    """
    Parse a list of ingredient strings.
    """

    crf_models = convert_list_to_crf_model(ingredients.ingredients)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    return {"ingredient": ingredients}
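
As wired above, the include for ingredient_parser.public_router omits the prefix=prefix argument, so the endpoint is registered at /parse/ingredient rather than /recipes/parse/ingredient. A minimal sketch of calling it with FastAPI's TestClient, assuming the modules from this commit are importable and the CRF++ crf_test binary plus model.crfmodel are present:

    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    from mealie.routes.recipe import ingredient_parser

    # Mount the new router exactly as the diff wires it (no "/recipes" prefix).
    app = FastAPI()
    app.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])

    client = TestClient(app)
    response = client.post("/parse/ingredient", json={"ingredients": ["2 tablespoons honey"]})
    print(response.json())  # => {"ingredient": [...]}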

mealie/services/scraper/ingredient_nlp/__init__.py (new file, empty)

mealie/services/scraper/ingredient_nlp/processor.py (new file, 74 lines)

@@ -0,0 +1,74 @@
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Optional

from mealie.core.config import settings
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from pydantic import BaseModel

from . import utils

CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"

INGREDIENT_TEXT = [
    "2 tablespoons honey",
    "1/2 cup flour",
    "Black pepper, to taste",
    "2 cups of garlic finely chopped",
    "2 liters whole milk",
]


class CRFIngredient(BaseModel):
    input: Optional[str] = ""
    name: Optional[str] = ""
    other: Optional[str] = ""
    qty: Optional[str] = ""
    comment: Optional[str] = ""
    unit: Optional[str] = ""


def _exec_crf_test(input_text):
    # Requires the CRF++ "crf_test" binary on PATH and the trained model file.
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
    crf_output = _exec_crf_test(list_of_ingredient_text)

    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

    for model in crf_models:
        print(model)  # WIP: debug output

    return crf_models


def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
    return [
        RecipeIngredient(
            title="",
            note=crf_model.comment,
            unit=CreateIngredientUnit(name=crf_model.unit),
            food=CreateIngredientFood(name=crf_model.name),
            disable_amount=settings.RECIPE_DISABLE_AMOUNT,
            quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
        )
        for crf_model in crf_models
    ]


if __name__ == "__main__":
    crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    for ingredient in ingredients:
        print(ingredient)
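
The quantity conversion above turns the tagger's qty strings, which can be mixed numbers like "1 2/3" once unclumped, into floats by summing each whitespace-separated part as a Fraction. A small standalone illustration of that arithmetic:

    from fractions import Fraction

    # "1 2/3" parses as Fraction(1) + Fraction(2, 3) = 5/3
    print(float(sum(Fraction(part) for part in "1 2/3".split())))  # 1.6666666666666667
    print(float(sum(Fraction(part) for part in "1/2".split())))    # 0.5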

mealie/services/scraper/ingredient_nlp/tokenizer.py (new file, 37 lines)

@@ -0,0 +1,37 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)


def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviations like "100g" by treating them as "100 grams"
    s = re.sub(r"(\d+)g", r"\1 grams", s)
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces them with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
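
A quick demonstration of the tokenizer on a dual-unit line, assuming the module path from this commit; the "cups/" replacement and clumpFractions combine so the mixed-number quantity survives as a single token:

    from mealie.services.scraper.ingredient_nlp.tokenizer import clumpFractions, tokenize

    print(clumpFractions("aaa 1 2/3 bbb"))
    # aaa 1$2/3 bbb

    print(tokenize("2 1/2 cups/300 grams all-purpose flour"))
    # ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']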

mealie/services/scraper/ingredient_nlp/utils.py (new file, 278 lines)

@@ -0,0 +1,278 @@
import re

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

        "1\u215e" => "1 7/8"
    """

    # NB: these must be \u escapes; Python's \x takes exactly two hex digits.
    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\xbc": "1/4",
        "\xbe": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\xbd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        s = s.replace(f_unicode, " " + f_ascii)

    return s
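
A quick check of the fraction replacement, assuming the module path from this commit. The leading space matters: it turns "1⅞" into "1 7/8", which the tokenizer's clumpFractions can then glue back into a single quantity token:

    from mealie.services.scraper.ingredient_nlp.utils import cleanUnicodeFractions

    print(cleanUnicodeFractions("1\u215e cups milk"))
    # 1 7/8 cups milk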

def unclump(s):
    """
    Replaces $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def normalizeToken(s):
    """
    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
    return singularize(s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        ("I%s" % index),
        ("L%s" % lengthGroup(length)),
        ("Yes" if isCapitalized(token) else "No") + "CAP",
        ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
    ]


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units:
        return units[word]
    else:
        return word


def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
        # => <span class='qty'>1</span> <span class='name'>cat pie</span>
    """

    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    input = " ".join(words)

    # replace " , " with ", "
    input = input.replace(" , ", ", ")

    # replace "( " with "("
    input = input.replace("( ", "(")

    # replace " )" with ")"
    input = input.replace(" )", ")")

    return input


def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None

    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black     I1  L8  YesCAP  X  B-NAME/0.765461
    #   pepper    I2  L8  NoCAP   X  I-NAME/0.756614
    #   ,         I3  L8  NoCAP   X  OTHER/0.798040
    #   to        I4  L8  NoCAP   X  B-COMMENT/0.683089
    #   taste     I5  L8  NoCAP   X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag
            else:
                display[-1][-1][1].append(token)
                #                  ^- token
                #            ^---- tag
                #        ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]

    # Add the marked-up display data
    for i, v in enumerate(output):
        output[i]["display"] = displayIngredient(display[i])

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])

    return output


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output."""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append("")
    return "\n".join(output)
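
export_data is what feeds crf_test: each token becomes a tab-separated line of the token plus its getFeatures output, with a blank line terminating each ingredient. A sketch of the expected shape, assuming this module is importable (tabs shown as spaces):

    from mealie.services.scraper.ingredient_nlp import utils

    print(utils.export_data(["1/2 cup flour"]))
    # 1/2    I1  L4  NoCAP  NoPAREN
    # cup    I2  L4  NoCAP  NoPAREN
    # flour  I3  L4  NoCAP  NoPAREN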