diff --git a/.gitignore b/.gitignore
index 34ea61503ad6..8818df6e4673 100644
--- a/.gitignore
+++ b/.gitignore
@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
 !dev/data/backups/test*.zip
 dev/data/recipes/*
 dev/scripts/output/app_routes.py
-dev/scripts/output/javascriptAPI/*
\ No newline at end of file
+dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel
diff --git a/mealie/routes/recipe/__init__.py b/mealie/routes/recipe/__init__.py
index b7b49093f56c..36d6b91553e0 100644
--- a/mealie/routes/recipe/__init__.py
+++ b/mealie/routes/recipe/__init__.py
@@ -1,5 +1,5 @@
 from fastapi import APIRouter
-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes
 
 prefix = "/recipes"
 
@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Reci
 router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
 router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
 router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])
diff --git a/mealie/routes/recipe/ingredient_parser.py b/mealie/routes/recipe/ingredient_parser.py
new file mode 100644
index 000000000000..376053953e18
--- /dev/null
+++ b/mealie/routes/recipe/ingredient_parser.py
@@ -0,0 +1,24 @@
+from fastapi import APIRouter
+from mealie.services.scraper.ingredient_nlp.processor import (
+    convert_crf_models_to_ingredients,
+    convert_list_to_crf_model,
+)
+from pydantic import BaseModel
+
+public_router = APIRouter()
+
+
+class IngredientRequest(BaseModel):
+    ingredients: list[str]
+
+
+@public_router.post("/parse/ingredient")
+def parse_ingredients(ingredients: IngredientRequest):
+    """
+    Parse a list of ingredient strings.
+    """
+
+    crf_models = convert_list_to_crf_model(ingredients.ingredients)
+    parsed_ingredients = convert_crf_models_to_ingredients(crf_models)
+
+    return {"ingredient": parsed_ingredients}
diff --git a/mealie/services/scraper/ingredient_nlp/__init__.py b/mealie/services/scraper/ingredient_nlp/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
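For reference, a minimal sketch of exercising the new endpoint. Note that public_router is included without the "/recipes" prefix, so the route sits at the parent router's root; the host, port, and "/api" mount point below are assumptions about a stock Mealie deployment, not something this diff establishes.

    # Hypothetical client call; the base URL and /api mount point are assumptions.
    import requests

    response = requests.post(
        "http://localhost:9000/api/parse/ingredient",
        json={"ingredients": ["2 tablespoons honey", "1/2 cup flour"]},
    )
    # The response body uses the singular "ingredient" key defined above,
    # holding one parsed RecipeIngredient per input string.
    print(response.json())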
diff --git a/mealie/services/scraper/ingredient_nlp/processor.py b/mealie/services/scraper/ingredient_nlp/processor.py
new file mode 100644
index 000000000000..e3405efd0699
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@@ -0,0 +1,71 @@
+import subprocess
+import tempfile
+from fractions import Fraction
+from pathlib import Path
+from typing import Optional
+
+from mealie.core.config import settings
+from mealie.schema.recipe import RecipeIngredient
+from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
+from pydantic import BaseModel
+
+from . import utils
+
+CWD = Path(__file__).parent
+MODEL_PATH = CWD / "model.crfmodel"
+
+INGREDIENT_TEXT = [
+    "2 tablespoons honey",
+    "1/2 cup flour",
+    "Black pepper, to taste",
+    "2 cups of garlic finely chopped",
+    "2 liters whole milk",
+]
+
+
+class CRFIngredient(BaseModel):
+    input: Optional[str] = ""
+    name: Optional[str] = ""
+    other: Optional[str] = ""
+    qty: Optional[str] = ""
+    comment: Optional[str] = ""
+    unit: Optional[str] = ""
+
+
+def _exec_crf_test(input_text):
+    # Write the featurized ingredient lines to a temp file and shell out to
+    # the crf_test binary, which tags each token using the trained model.
+    with tempfile.NamedTemporaryFile(mode="w") as input_file:
+        input_file.write(utils.export_data(input_text))
+        input_file.flush()
+        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
+            "utf-8"
+        )
+
+
+def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
+    crf_output = _exec_crf_test(list_of_ingredient_text)
+
+    return [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
+
+
+def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
+    return [
+        RecipeIngredient(
+            title="",
+            note=crf_model.comment,
+            unit=CreateIngredientUnit(name=crf_model.unit),
+            food=CreateIngredientFood(name=crf_model.name),
+            disable_amount=settings.RECIPE_DISABLE_AMOUNT,
+            quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
+        )
+        for crf_model in crf_models
+    ]
+
+
+if __name__ == "__main__":
+    crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
+    ingredients = convert_crf_models_to_ingredients(crf_models)
+
+    for ingredient in ingredients:
+        print(ingredient)
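A quick worked example of the quantity conversion in convert_crf_models_to_ingredients: the CRF emits quantities as strings such as "1 1/2", which are split on whitespace and summed as exact fractions before the final float conversion, so mixed numbers come out right.

    # Standalone illustration of the qty parsing used above (stdlib only).
    from fractions import Fraction

    for qty in ("1 1/2", "3/4", "2"):
        print(qty, "->", float(sum(Fraction(s) for s in qty.split())))
    # 1 1/2 -> 1.5
    # 3/4 -> 0.75
    # 2 -> 2.0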
diff --git a/mealie/services/scraper/ingredient_nlp/tokenizer.py b/mealie/services/scraper/ingredient_nlp/tokenizer.py
new file mode 100644
index 000000000000..4973d388a692
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py
@@ -0,0 +1,41 @@
+import re
+
+
+def clumpFractions(s):
+    """
+    Replaces the whitespace between the integer and fractional part of a quantity
+    with a dollar sign, so it's interpreted as a single token. The rest of the
+    string is left alone.
+
+        clumpFractions("aaa 1 2/3 bbb")
+        # => "aaa 1$2/3 bbb"
+    """
+
+    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)
+
+
+def tokenize(s):
+    """
+    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
+
+    We sometimes give American units and metric units for baking recipes. For example:
+      * 2 tablespoons/30 milliliters milk or cream
+      * 2 1/2 cups/300 grams all-purpose flour
+    The recipe database only allows for one unit, and we want to use the American one.
+    But we must split the text on "cups/" etc. in order to pick it up.
+    """
+
+    # Expand abbreviations like "100g" to "100 grams". The word boundary keeps
+    # already-expanded text such as "100 grams" from being mangled.
+    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
+    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
+    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)
+
+    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
+    # Replace the slash that follows an American unit (e.g. "cups/300 grams")
+    # with a space so the American unit becomes its own token.
+    for unit in american_units:
+        s = s.replace(unit + "/", unit + " ")
+        s = s.replace(unit + "s/", unit + "s ")
+
+    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
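To make the intended behaviour concrete, here is what the rules above should produce for a mixed American/metric line — a small sketch, assuming the module path used elsewhere in this PR:

    # Illustration of clumpFractions + tokenize on a mixed-unit ingredient line.
    from mealie.services.scraper.ingredient_nlp.tokenizer import clumpFractions, tokenize

    print(clumpFractions("aaa 1 2/3 bbb"))
    # => aaa 1$2/3 bbb
    print(tokenize("2 1/2 cups/300 grams all-purpose flour"))
    # => ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']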
diff --git a/mealie/services/scraper/ingredient_nlp/utils.py b/mealie/services/scraper/ingredient_nlp/utils.py
new file mode 100644
index 000000000000..f573ea93a8c7
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/utils.py
@@ -0,0 +1,278 @@
+import re
+
+from . import tokenizer
+
+
+def joinLine(columns):
+    return "\t".join(columns)
+
+
+def cleanUnicodeFractions(s):
+    """
+    Replace unicode fractions with ascii representation, preceded by a
+    space.
+
+    "1\u215e" => "1 7/8"
+    """
+
+    fractions = {
+        "\u215b": "1/8",
+        "\u215c": "3/8",
+        "\u215d": "5/8",
+        "\u215e": "7/8",
+        "\u2159": "1/6",
+        "\u215a": "5/6",
+        "\u2155": "1/5",
+        "\u2156": "2/5",
+        "\u2157": "3/5",
+        "\u2158": "4/5",
+        "\xbc": "1/4",
+        "\xbe": "3/4",
+        "\u2153": "1/3",
+        "\u2154": "2/3",
+        "\xbd": "1/2",
+    }
+
+    for f_unicode, f_ascii in fractions.items():
+        s = s.replace(f_unicode, " " + f_ascii)
+
+    return s
+
+
+def unclump(s):
+    """
+    Replaces $'s with spaces. The reverse of clumpFractions.
+    """
+    return re.sub(r"\$", " ", s)
+
+
+def normalizeToken(s):
+    """
+    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+    in the name of simple deployments, we took it out. We should fix this at some
+    point.
+    """
+    return singularize(s)
+
+
+def getFeatures(token, index, tokens):
+    """
+    Returns a list of features for a given token.
+    """
+    length = len(tokens)
+
+    return [
+        ("I%s" % index),
+        ("L%s" % lengthGroup(length)),
+        ("Yes" if isCapitalized(token) else "No") + "CAP",
+        ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
+    ]
+
+
+def singularize(word):
+    """
+    A poor replacement for the pattern.en singularize function, but ok for now.
+    """
+
+    units = {
+        "cups": "cup",
+        "tablespoons": "tablespoon",
+        "teaspoons": "teaspoon",
+        "pounds": "pound",
+        "ounces": "ounce",
+        "cloves": "clove",
+        "sprigs": "sprig",
+        "pinches": "pinch",
+        "bunches": "bunch",
+        "slices": "slice",
+        "grams": "gram",
+        "heads": "head",
+        "quarts": "quart",
+        "stalks": "stalk",
+        "pints": "pint",
+        "pieces": "piece",
+        "sticks": "stick",
+        "dashes": "dash",
+        "fillets": "fillet",
+        "cans": "can",
+        "ears": "ear",
+        "packages": "package",
+        "strips": "strip",
+        "bulbs": "bulb",
+        "bottles": "bottle",
+    }
+
+    if word in units.keys():
+        return units[word]
+    else:
+        return word
+
+
+def isCapitalized(token):
+    """
+    Returns true if a given token starts with a capital letter.
+    """
+    return re.match(r"^[A-Z]", token) is not None
+
+
+def lengthGroup(actualLength):
+    """
+    Buckets the length of the ingredient into 6 buckets.
+    """
+    for n in [4, 8, 12, 16, 20]:
+        if actualLength < n:
+            return str(n)
+
+    return "X"
+
+
+def insideParenthesis(token, tokens):
+    """
+    Returns true if the word is inside parenthesis in the phrase.
+    """
+    if token in ["(", ")"]:
+        return True
+    else:
+        line = " ".join(tokens)
+        return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None
+
+
+def displayIngredient(ingredient):
+    """
+    Format a list of (tag, [tokens]) tuples as an HTML string for display.
+
+        displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
+        # => <span class='qty'>1</span><span class='name'>cat pie</span>
+    """
+
+    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])
+
+
+# HACK: fix this
+def smartJoin(words):
+    """
+    Joins list of words with spaces, but is smart about not adding spaces
+    before commas.
+    """
+
+    joined = " ".join(words)
+
+    # replace " , " with ", "
+    joined = joined.replace(" , ", ", ")
+
+    # replace "( " with "("
+    joined = joined.replace("( ", "(")
+
+    # replace " )" with ")"
+    joined = joined.replace(" )", ")")
+
+    return joined
+
+
+def import_data(lines):
+    """
+    This thing takes the output of CRF++ and turns it into an actual
+    data structure.
+    """
+    data = [{}]
+    display = [[]]
+    prevTag = None
+    #
+    # iterate lines in the data file, which looks like:
+    #
+    #   # 0.511035
+    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
+    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
+    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
+    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
+    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
+    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
+    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
+    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
+    #
+    #   # 0.505999
+    #   Black     I1  L8  YesCAP  X  B-NAME/0.765461
+    #   pepper    I2  L8  NoCAP   X  I-NAME/0.756614
+    #   ,         I3  L8  NoCAP   X  OTHER/0.798040
+    #   to        I4  L8  NoCAP   X  B-COMMENT/0.683089
+    #   taste     I5  L8  NoCAP   X  I-COMMENT/0.848617
+    #
+    # i.e. the output of crf_test -v 1
+    #
+    for line in lines:
+        # blank line starts a new ingredient
+        if line in ("", "\n"):
+            data.append({})
+            display.append([])
+            prevTag = None
+
+        # ignore comments
+        elif line[0] == "#":
+            pass
+
+        # otherwise it's a token
+        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
+        else:
+
+            columns = re.split("\t", line.strip())
+            token = columns[0].strip()
+
+            # unclump fractions
+            token = unclump(token)
+
+            # turn B-NAME/123 back into "name"
+            tag, confidence = re.split(r"/", columns[-1], 1)
+            tag = re.sub(r"^[BI]\-", "", tag).lower()
+
+            # ---- DISPLAY ----
+            # build a structure which groups each token by its tag, so we can
+            # rebuild the original display name later.
+
+            if prevTag != tag:
+                display[-1].append((tag, [token]))
+                prevTag = tag
+
+            else:
+                display[-1][-1][1].append(token)
+                #               ^- token
+                #            ^---- tag
+                #        ^-------- ingredient
+
+            # ---- DATA ----
+            # build a dict grouping tokens by their tag
+
+            # initialize this attribute if this is the first token of its kind
+            if tag not in data[-1]:
+                data[-1][tag] = []
+
+            # HACK: If this token is a unit, singularize it so Scoop accepts it.
+            if tag == "unit":
+                token = singularize(token)
+
+            data[-1][tag].append(token)
+
+    # reassemble the output into a list of dicts.
+    output = [
+        dict([(k, smartJoin(tokens)) for k, tokens in ingredient.items()]) for ingredient in data if len(ingredient)
+    ]
+
+    # Add the marked-up display data
+    for i, v in enumerate(output):
+        output[i]["display"] = displayIngredient(display[i])
+
+    # Add the raw ingredient phrase
+    for i, v in enumerate(output):
+        output[i]["input"] = smartJoin([" ".join(tokens) for k, tokens in display[i]])
+
+    return output
+
+
+def export_data(lines):
+    """Parse "raw" ingredient lines into CRF-ready output"""
+    output = []
+    for line in lines:
+        line_clean = re.sub("<[^<]+?>", "", line)
+        tokens = tokenizer.tokenize(line_clean)
+
+        for i, token in enumerate(tokens):
+            features = getFeatures(token, i + 1, tokens)
+            output.append(joinLine([token] + features))
+        output.append("")
+    return "\n".join(output)
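For orientation, a sketch of the CRF input that export_data generates for a single ingredient line: one tab-separated row per token carrying the index, length-bucket, capitalization, and parenthesis features that getFeatures builds, with a blank line terminating each ingredient. The output shown is what the functions above should produce.

    # Illustration of the featurized rows fed to crf_test.
    from mealie.services.scraper.ingredient_nlp.utils import export_data

    print(export_data(["1/2 cup flour"]))
    # 1/2    I1    L4    NoCAP    NoPAREN
    # cup    I2    L4    NoCAP    NoPAREN
    # flour  I3    L4    NoCAP    NoPAREN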