feat(backend): ✨ Add NLP Endpoint for Ingredient Parser (WIP)
This commit is contained in:
parent 161618808e
commit 20d847ec8e
3 .gitignore vendored
@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
 !dev/data/backups/test*.zip
 dev/data/recipes/*
 dev/scripts/output/app_routes.py
 dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel
mealie/routes/recipe/__init__.py

@@ -1,5 +1,5 @@
 from fastapi import APIRouter
-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes

 prefix = "/recipes"

@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
 router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])
24 mealie/routes/recipe/ingredient_parser.py Normal file
@@ -0,0 +1,24 @@
from fastapi import APIRouter
from mealie.services.scraper.ingredient_nlp.processor import (
    convert_crf_models_to_ingredients,
    convert_list_to_crf_model,
)
from pydantic import BaseModel

public_router = APIRouter()


class IngredientRequest(BaseModel):
    ingredients: list[str]


@public_router.post("/parse/ingredient")
def parse_ingredients(ingredients: IngredientRequest):
    """
    Parse a list of ingredient strings.
    """

    crf_models = convert_list_to_crf_model(ingredients.ingredients)
    parsed = convert_crf_models_to_ingredients(crf_models)

    return {"ingredient": parsed}
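A minimal way to exercise the new endpoint is FastAPI's TestClient. This is a sketch only: the import path for `app` and the final route prefix are assumptions about how mealie wires its routers, not part of this commit.

from fastapi.testclient import TestClient

from mealie.app import app  # hypothetical import path for the app instance

client = TestClient(app)
resp = client.post("/api/parse/ingredient", json={"ingredients": ["2 tablespoons honey"]})
print(resp.json())  # => {"ingredient": [...]}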
0 mealie/services/scraper/ingredient_nlp/__init__.py Normal file
74 mealie/services/scraper/ingredient_nlp/processor.py Normal file
@@ -0,0 +1,74 @@
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Optional

from mealie.core.config import settings
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from pydantic import BaseModel

from . import utils

CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel"

INGREDIENT_TEXT = [
    "2 tablespoons honey",
    "1/2 cup flour",
    "Black pepper, to taste",
    "2 cups of garlic finely chopped",
    "2 liters whole milk",
]


class CRFIngredient(BaseModel):
    input: Optional[str] = ""
    name: Optional[str] = ""
    other: Optional[str] = ""
    qty: Optional[str] = ""
    comment: Optional[str] = ""
    unit: Optional[str] = ""
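

# NOTE: `crf_test` is the CRF++ command-line tagger. The call below assumes the
# binary is available on PATH and that the pre-trained model file exists at
# MODEL_PATH.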
def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
    crf_output = _exec_crf_test(list_of_ingredient_text)

    crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]

    for model in crf_models:
        print(model)

    return crf_models


def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
    return [
        RecipeIngredient(
            title="",
            note=crf_model.comment,
            unit=CreateIngredientUnit(name=crf_model.unit),
            food=CreateIngredientFood(name=crf_model.name),
            disable_amount=settings.RECIPE_DISABLE_AMOUNT,
            quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
        )
        for crf_model in crf_models
    ]
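
# Note on the quantity math above: a qty string such as "1 1/2" is summed as
# fractions (Fraction("1") + Fraction("1/2") == Fraction(3, 2)), giving 1.5;
# an empty qty string sums to 0.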


if __name__ == "__main__":
    crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
    ingredients = convert_crf_models_to_ingredients(crf_models)

    for ingredient in ingredients:
        print(ingredient)
37 mealie/services/scraper/ingredient_nlp/tokenizer.py Normal file
@@ -0,0 +1,37 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)

def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviations like "100g" by expanding them to "100 grams"; the
    # word boundary keeps already-expanded forms like "100 grams" intact
    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)

    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces them with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
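As a quick check of the slash handling plus fraction clumping, tracing the code above on the docstring's second example gives:

tokenize("2 1/2 cups/300 grams all-purpose flour")
# => ["2$1/2", "cups", "300", "grams", "all-purpose", "flour"]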
278 mealie/services/scraper/ingredient_nlp/utils.py Normal file
@@ -0,0 +1,278 @@
import re

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

        "1\u215e" => "1 7/8"
    """

    fractions = {
        "\u215b": "1/8",
        "\u215c": "3/8",
        "\u215d": "5/8",
        "\u215e": "7/8",
        "\u2159": "1/6",
        "\u215a": "5/6",
        "\u2155": "1/5",
        "\u2156": "2/5",
        "\u2157": "3/5",
        "\u2158": "4/5",
        "\u00bc": "1/4",
        "\u00be": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
        "\u00bd": "1/2",
    }

    for f_unicode, f_ascii in fractions.items():
        s = s.replace(f_unicode, " " + f_ascii)

    return s
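
# For example, given the escapes above:
#     cleanUnicodeFractions("1\u00bd cups")  =>  "1 1/2 cups"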


def unclump(s):
    """
    Replaces $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def normalizeToken(s):
    """
    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
    in the name of simple deployments, we took it out. We should fix this at some
    point.
    """
    return singularize(s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        ("I%s" % index),
        ("L%s" % lengthGroup(length)),
        ("Yes" if isCapitalized(token) else "No") + "CAP",
        ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
    ]
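
# For example:
#     getFeatures("pepper", 2, ["Black", "pepper"])
#     # => ["I2", "L4", "NoCAP", "NoPAREN"]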


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units:
        return units[word]
    else:
        return word
def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
        # => <span class='qty'>1</span><span class='name'>cat pie</span>
    """

    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    joined = " ".join(words)

    # replace " , " with ", "
    joined = joined.replace(" , ", ", ")

    # replace "( " with "("
    joined = joined.replace("( ", "(")

    # replace " )" with ")"
    joined = joined.replace(" )", ")")

    return joined
def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None
    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black     I1  L8   YesCAP X  B-NAME/0.765461
    #   pepper    I2  L8   NoCAP  X  I-NAME/0.756614
    #   ,         I3  L8   NoCAP  X  OTHER/0.798040
    #   to        I4  L8   NoCAP  X  B-COMMENT/0.683089
    #   taste     I5  L8   NoCAP  X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()

            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag
            else:
                display[-1][-1][1].append(token)
                #               ^- token
                #           ^---- tag
                #       ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [
        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
    ]

    # Add the marked-up display data
    for i, v in enumerate(output):
        output[i]["display"] = displayIngredient(display[i])

    # Add the raw ingredient phrase
    for i, v in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])

    return output
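
# Fed the five "Black pepper , to taste" rows from the sample output above,
# import_data returns a single dict shaped like:
#     {"name": "Black pepper", "other": ",", "comment": "to taste",
#      "display": "<span class='name'>Black pepper</span>...",
#      "input": "Black pepper, to taste"}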


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output"""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append("")
    return "\n".join(output)
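For a single phrase, the CRF-ready block produced by export_data looks like this (traced from tokenize and getFeatures above, one tab-joined line per token followed by a blank line):

export_data(["Black pepper, to taste"])
# => "Black\tI1\tL8\tYesCAP\tNoPAREN
#     pepper\tI2\tL8\tNoCAP\tNoPAREN
#     ,\tI3\tL8\tNoCAP\tNoPAREN
#     to\tI4\tL8\tNoCAP\tNoPAREN
#     taste\tI5\tL8\tNoCAP\tNoPAREN
#     "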