diff --git a/.gitignore b/.gitignore
index 34ea61503ad6..8818df6e4673 100644
--- a/.gitignore
+++ b/.gitignore
@@ -149,4 +149,5 @@ dev/data/backups/dev_sample_data*.zip
!dev/data/backups/test*.zip
dev/data/recipes/*
dev/scripts/output/app_routes.py
-dev/scripts/output/javascriptAPI/*
\ No newline at end of file
+dev/scripts/output/javascriptAPI/*
+mealie/services/scraper/ingredient_nlp/model.crfmodel
diff --git a/mealie/routes/recipe/__init__.py b/mealie/routes/recipe/__init__.py
index b7b49093f56c..36d6b91553e0 100644
--- a/mealie/routes/recipe/__init__.py
+++ b/mealie/routes/recipe/__init__.py
@@ -1,5 +1,5 @@
from fastapi import APIRouter

-from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, recipe_crud_routes
+from mealie.routes.recipe import all_recipe_routes, comments, image_and_assets, ingredient_parser, recipe_crud_routes

prefix = "/recipes"
@@ -10,3 +10,4 @@ router.include_router(recipe_crud_routes.user_router, prefix=prefix, tags=["Reci
router.include_router(recipe_crud_routes.public_router, prefix=prefix, tags=["Recipe: CRUD"])
router.include_router(image_and_assets.user_router, prefix=prefix, tags=["Recipe: Images and Assets"])
router.include_router(comments.router, prefix=prefix, tags=["Recipe: Comments"])
+router.include_router(ingredient_parser.public_router, tags=["Recipe: Ingredient Parser"])
diff --git a/mealie/routes/recipe/ingredient_parser.py b/mealie/routes/recipe/ingredient_parser.py
new file mode 100644
index 000000000000..376053953e18
--- /dev/null
+++ b/mealie/routes/recipe/ingredient_parser.py
@@ -0,0 +1,24 @@
+from fastapi import APIRouter
+from mealie.services.scraper.ingredient_nlp.processor import (
+ convert_crf_models_to_ingredients,
+ convert_list_to_crf_model,
+)
+from pydantic import BaseModel
+
+public_router = APIRouter()
+
+
+class IngredientRequest(BaseModel):
+ ingredients: list[str]
+
+
+@public_router.post("/parse/ingredient")
+def parse_ingredients(ingredients: IngredientRequest):
+ """
+ Parse an ingredient string.
+ """
+
+ crf_models = convert_list_to_crf_model(ingredients.ingredients)
+ ingredients = convert_crf_models_to_ingredients(crf_models)
+
+ return {"ingredient": ingredients}
diff --git a/mealie/services/scraper/ingredient_nlp/__init__.py b/mealie/services/scraper/ingredient_nlp/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/mealie/services/scraper/ingredient_nlp/processor.py b/mealie/services/scraper/ingredient_nlp/processor.py
new file mode 100644
index 000000000000..e3405efd0699
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@@ -0,0 +1,74 @@
+import subprocess
+import tempfile
+from fractions import Fraction
+from pathlib import Path
+from typing import Optional
+
+from mealie.core.config import settings
+from mealie.schema.recipe import RecipeIngredient
+from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
+from pydantic import BaseModel
+
+from . import utils
+
+CWD = Path(__file__).parent
+MODEL_PATH = CWD / "model.crfmodel"
+
+INGREDIENT_TEXT = [
+ "2 tablespoons honey",
+ "1/2 cup flour",
+ "Black pepper, to taste",
+ "2 cups of garlic finely chopped",
+ "2 liters whole milk",
+]
+
+
+class CRFIngredient(BaseModel):
+ input: Optional[str] = ""
+ name: Optional[str] = ""
+ other: Optional[str] = ""
+ qty: Optional[str] = ""
+ comment: Optional[str] = ""
+ unit: Optional[str] = ""
+
+
+def _exec_crf_test(input_text):
+    # crf_test reads its input from a file, so stage the exported feature lines
+    # in a temporary file and hand the file's path to the binary.
+    with tempfile.NamedTemporaryFile(mode="w") as input_file:
+        input_file.write(utils.export_data(input_text))
+        input_file.flush()
+        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
+            "utf-8"
+        )
+
+
+def convert_list_to_crf_model(list_of_ingredient_text: list[str]):
+    crf_output = _exec_crf_test(list_of_ingredient_text)
+
+    return [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
+
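+# Roughly, ["1/2 cup flour"] is expected to come back as something like
+# (illustrative; the actual tagging depends on the trained model):
+#   [CRFIngredient(input="1/2 cup flour", qty="1/2", unit="cup", name="flour")]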
+
+def convert_crf_models_to_ingredients(crf_models: list[CRFIngredient]):
+ return [
+ RecipeIngredient(
+ title="",
+ note=crf_model.comment,
+ unit=CreateIngredientUnit(name=crf_model.unit),
+ food=CreateIngredientFood(name=crf_model.name),
+ disable_amount=settings.RECIPE_DISABLE_AMOUNT,
+ quantity=float(sum(Fraction(s) for s in crf_model.qty.split())),
+ )
+ for crf_model in crf_models
+ ]
+
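+# The qty string may hold a mixed number: "1 1/2".split() -> ["1", "1/2"], and
+# float(Fraction("1") + Fraction("1/2")) == 1.5, which is what the sum above computes.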
+
+if __name__ == "__main__":
+ crf_models = convert_list_to_crf_model(INGREDIENT_TEXT)
+ ingredients = convert_crf_models_to_ingredients(crf_models)
+
+ for ingredient in ingredients:
+ print(ingredient)
diff --git a/mealie/services/scraper/ingredient_nlp/tokenizer.py b/mealie/services/scraper/ingredient_nlp/tokenizer.py
new file mode 100644
index 000000000000..4973d388a692
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py
@@ -0,0 +1,37 @@
+import re
+
+
+def clumpFractions(s):
+ """
+ Replaces the whitespace between the integer and fractional part of a quantity
+ with a dollar sign, so it's interpreted as a single token. The rest of the
+ string is left alone.
+ clumpFractions("aaa 1 2/3 bbb")
+ # => "aaa 1$2/3 bbb"
+ """
+
+ return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)
+
+
+def tokenize(s):
+ """
+    Tokenize on parentheses, punctuation, spaces, and American units followed by a slash.
+    We sometimes give American units and metric units for baking recipes. For example:
+    * 2 tablespoons/30 milliliters milk or cream
+ * 2 1/2 cups/300 grams all-purpose flour
+ The recipe database only allows for one unit, and we want to use the American one.
+ But we must split the text on "cups/" etc. in order to pick it up.
+ """
+
+    # handle abbreviations like "100g" by treating them as "100 grams"; the word
+    # boundary keeps "100grams" from being rewritten a second time
+    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
+    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
+    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)
+
+ american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
+    # The following replaces slashes following American units with a space.
+ for unit in american_units:
+ s = s.replace(unit + "/", unit + " ")
+ s = s.replace(unit + "s/", unit + "s ")
+
+    return [token.strip() for token in re.split(r"([,()\s])", clumpFractions(s)) if token and token.strip()]
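+
+
+# Doctest-style sketches of the expected behavior (illustrative, not from a
+# test suite):
+#   tokenize("2 tablespoons/30 milliliters milk")
+#   # => ["2", "tablespoons", "30", "milliliters", "milk"]
+#   tokenize("1 1/2 cups flour, sifted")
+#   # => ["1$1/2", "cups", "flour", ",", "sifted"]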
diff --git a/mealie/services/scraper/ingredient_nlp/utils.py b/mealie/services/scraper/ingredient_nlp/utils.py
new file mode 100644
index 000000000000..f573ea93a8c7
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/utils.py
@@ -0,0 +1,278 @@
+import re
+
+from . import tokenizer
+
+
+def joinLine(columns):
+ return "\t".join(columns)
+
+
+def cleanUnicodeFractions(s):
+ """
+ Replace unicode fractions with ascii representation, preceded by a
+ space.
+ "1\x215e" => "1 7/8"
+ """
+
+ fractions = {
+ "\x215b": "1/8",
+ "\x215c": "3/8",
+ "\x215d": "5/8",
+ "\x215e": "7/8",
+ "\x2159": "1/6",
+ "\x215a": "5/6",
+ "\x2155": "1/5",
+ "\x2156": "2/5",
+ "\x2157": "3/5",
+ "\x2158": "4/5",
+ "\xbc": " 1/4",
+ "\xbe": "3/4",
+ "\x2153": "1/3",
+ "\x2154": "2/3",
+ "\xbd": "1/2",
+ }
+
+ for f_unicode, f_ascii in fractions.items():
+ s = s.replace(f_unicode, " " + f_ascii)
+
+ return s
+
+
+def unclump(s):
+ """
+    Replaces $'s with spaces. The reverse of clumpFractions.
+ """
+ return re.sub(r"\$", " ", s)
+
+
+def normalizeToken(s):
+ """
+ ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+ in the name of simple deployments, we took it out. We should fix this at some
+ point.
+ """
+ return singularize(s)
+
+
+def getFeatures(token, index, tokens):
+ """
+ Returns a list of features for a given token.
+ """
+ length = len(tokens)
+
+ return [
+ ("I%s" % index),
+ ("L%s" % lengthGroup(length)),
+ ("Yes" if isCapitalized(token) else "No") + "CAP",
+ ("Yes" if insideParenthesis(token, tokens) else "No") + "PAREN",
+ ]
+
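+# Illustrative example:
+#   getFeatures("cups", 2, ["2", "cups", "flour"])
+#   # => ["I2", "L4", "NoCAP", "NoPAREN"]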
+
+def singularize(word):
+ """
+ A poor replacement for the pattern.en singularize function, but ok for now.
+ """
+
+ units = {
+ "cups": "cup",
+ "tablespoons": "tablespoon",
+ "teaspoons": "teaspoon",
+ "pounds": "pound",
+ "ounces": "ounce",
+ "cloves": "clove",
+ "sprigs": "sprig",
+ "pinches": "pinch",
+ "bunches": "bunch",
+ "slices": "slice",
+ "grams": "gram",
+ "heads": "head",
+ "quarts": "quart",
+ "stalks": "stalk",
+ "pints": "pint",
+ "pieces": "piece",
+ "sticks": "stick",
+ "dashes": "dash",
+ "fillets": "fillet",
+ "cans": "can",
+ "ears": "ear",
+ "packages": "package",
+ "strips": "strip",
+ "bulbs": "bulb",
+ "bottles": "bottle",
+ }
+
+    return units.get(word, word)
+
+
+def isCapitalized(token):
+ """
+ Returns true if a given token starts with a capital letter.
+ """
+ return re.match(r"^[A-Z]", token) is not None
+
+
+def lengthGroup(actualLength):
+ """
+    Buckets the length of the ingredient phrase into one of six buckets.
+ """
+ for n in [4, 8, 12, 16, 20]:
+ if actualLength < n:
+ return str(n)
+
+ return "X"
+
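+# e.g. lengthGroup(3) == "4", lengthGroup(10) == "12", lengthGroup(25) == "X"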
+
+def insideParenthesis(token, tokens):
+ """
+    Returns true if the word is inside parentheses in the phrase.
+ """
+ if token in ["(", ")"]:
+ return True
+ else:
+ line = " ".join(tokens)
+ return re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None
+
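+# Illustrative: insideParenthesis("chopped", ["2", "cups", "(", "finely", "chopped", ")"])
+# is True, while insideParenthesis("cups", <same tokens>) is False.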
+
+def displayIngredient(ingredient):
+ """
+ Format a list of (tag, [tokens]) tuples as an HTML string for display.
+ displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
+    # => <span class='qty'>1</span><span class='name'>cat pie</span>
+    """
+
+    return "".join(["<span class='%s'>%s</span>" % (tag, " ".join(tokens)) for tag, tokens in ingredient])
+
+
+# HACK: fix this
+def smartJoin(words):
+ """
+ Joins list of words with spaces, but is smart about not adding spaces
+ before commas.
+ """
+
+ input = " ".join(words)
+
+ # replace " , " with ", "
+ input = input.replace(" , ", ", ")
+
+ # replace " ( " with " ("
+ input = input.replace("( ", "(")
+
+ # replace " ) " with ") "
+ input = input.replace(" )", ")")
+
+ return input
+
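+# e.g. smartJoin(["cut", "into", "pieces", ",", "diced"]) == "cut into pieces, diced"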
+
+def import_data(lines):
+ """
+ This thing takes the output of CRF++ and turns it into an actual
+ data structure.
+ """
+ data = [{}]
+ display = [[]]
+ prevTag = None
+ #
+ # iterate lines in the data file, which looks like:
+ #
+ # # 0.511035
+ # 1/2 I1 L12 NoCAP X B-QTY/0.982850
+ # teaspoon I2 L12 NoCAP X B-UNIT/0.982200
+ # fresh I3 L12 NoCAP X B-COMMENT/0.716364
+ # thyme I4 L12 NoCAP X B-NAME/0.816803
+ # leaves I5 L12 NoCAP X I-NAME/0.960524
+ # , I6 L12 NoCAP X B-COMMENT/0.772231
+ # finely I7 L12 NoCAP X I-COMMENT/0.825956
+ # chopped I8 L12 NoCAP X I-COMMENT/0.893379
+ #
+ # # 0.505999
+ # Black I1 L8 YesCAP X B-NAME/0.765461
+ # pepper I2 L8 NoCAP X I-NAME/0.756614
+ # , I3 L8 NoCAP X OTHER/0.798040
+ # to I4 L8 NoCAP X B-COMMENT/0.683089
+ # taste I5 L8 NoCAP X I-COMMENT/0.848617
+ #
+ # i.e. the output of crf_test -v 1
+ #
+ for line in lines:
+ # blank line starts a new ingredient
+ if line in ("", "\n"):
+ data.append({})
+ display.append([])
+ prevTag = None
+
+ # ignore comments
+ elif line[0] == "#":
+ pass
+
+ # otherwise it's a token
+ # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
+ else:
+
+ columns = re.split("\t", line.strip())
+ token = columns[0].strip()
+
+ # unclump fractions
+ token = unclump(token)
+
+ # turn B-NAME/123 back into "name"
+ tag, confidence = re.split(r"/", columns[-1], 1)
+ tag = re.sub(r"^[BI]\-", "", tag).lower()
+
+ # ---- DISPLAY ----
+ # build a structure which groups each token by its tag, so we can
+ # rebuild the original display name later.
+
+ if prevTag != tag:
+ display[-1].append((tag, [token]))
+ prevTag = tag
+
+ else:
+                display[-1][-1][1].append(token)
+                #              ^- token list
+                #          ^- tag tuple
+                #      ^- current ingredient
+
+ # ---- DATA ----
+ # build a dict grouping tokens by their tag
+
+ # initialize this attribute if this is the first token of its kind
+ if tag not in data[-1]:
+ data[-1][tag] = []
+
+ # HACK: If this token is a unit, singularize it so Scoop accepts it.
+ if tag == "unit":
+ token = singularize(token)
+
+ data[-1][tag].append(token)
+
+    # reassemble the output into a list of dicts. data and display were built
+    # in lockstep, so zip keeps them aligned while empty blocks are skipped.
+    output = []
+    for ingredient, parts in zip(data, display):
+        if not ingredient:
+            continue
+        item = {k: smartJoin(tokens) for k, tokens in ingredient.items()}
+        # Add the marked-up display data
+        item["display"] = displayIngredient(parts)
+        # Add the raw ingredient phrase
+        item["input"] = smartJoin([" ".join(tokens) for _, tokens in parts])
+        output.append(item)
+
+    return output
+
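+# For the "Black pepper, to taste" block in the sample above, import_data
+# returns roughly (illustrative):
+#   [{"name": "Black pepper", "other": ",", "comment": "to taste",
+#     "display": "<span class='name'>Black pepper</span>...",
+#     "input": "Black pepper, to taste"}]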
+
+def export_data(lines):
+    """Parse "raw" ingredient lines into CRF-ready output."""
+    output = []
+    for line in lines:
+        # strip any HTML markup before tokenizing
+        line_clean = re.sub("<[^<]+?>", "", line)
+        tokens = tokenizer.tokenize(line_clean)
+
+        for i, token in enumerate(tokens):
+            features = getFeatures(token, i + 1, tokens)
+            output.append(joinLine([token] + features))
+        # a blank line terminates each ingredient phrase for crf_test
+        output.append("")
+    return "\n".join(output)