mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-07-09 03:04:54 -04:00
update NLP for ingredients
This commit is contained in:
parent
086098899d
commit
2e6352cfbd
@ -22,7 +22,7 @@ from mealie.schema.admin import (
|
|||||||
)
|
)
|
||||||
from mealie.schema.events import EventNotificationIn
|
from mealie.schema.events import EventNotificationIn
|
||||||
from mealie.schema.recipe import CommentOut, Recipe
|
from mealie.schema.recipe import CommentOut, Recipe
|
||||||
from mealie.schema.user import UpdateGroup, PrivateUser
|
from mealie.schema.user import PrivateUser, UpdateGroup
|
||||||
from mealie.services.image import minify
|
from mealie.services.image import minify
|
||||||
|
|
||||||
|
|
||||||
|
97
mealie/services/scraper/ingredient_nlp/pre_processor.py
Normal file
97
mealie/services/scraper/ingredient_nlp/pre_processor.py
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
# Abbreviation -> spelled-out unit. Keys and values keep their trailing space
# so the table stays usable by any caller that relied on the original shape.
replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}

# Same table with the trailing spaces dropped, used for whole-word matching.
_ABBREVIATION_WORDS = {abbr.strip(): full.strip() for abbr, full in replace_abbreviations.items()}

# An abbreviation only counts when it is not glued to another letter on either
# side. A plain substring replace turned "egg " into "eggram " and "kg " into
# "kgram "; digits are still allowed directly in front, so "100g" keeps working.
_ABBREVIATION_PATTERN = re.compile(
    r"(?<![A-Za-z])("
    + "|".join(re.escape(word) for word in sorted(_ABBREVIATION_WORDS, key=len, reverse=True))
    + r")(?![A-Za-z])"
)


def replace_common_abbreviations(string: str) -> str:
    """Expand unit abbreviations (``tsp`` -> ``teaspoon``, ``g`` -> ``gram``, ...).

    Only stand-alone abbreviations are expanded: a match must not be preceded
    or followed by an ASCII letter. This also expands an abbreviation that ends
    the string or is followed by punctuation. Matching is case-sensitive, as
    the table only holds lowercase abbreviations.

    Args:
        string: The ingredient text to normalise.

    Returns:
        The text with every known abbreviation replaced by its full unit name.
    """
    return _ABBREVIATION_PATTERN.sub(lambda match: _ABBREVIATION_WORDS[match.group(1)], string)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_periods(string: str) -> str:
    """Strip every period that has no digit directly before or after it.

    Periods touching a digit on either side (``1.5``, ``.5``, ``1.``) are kept.
    """
    lone_period = re.compile(r"(?<!\d)\.(?!\d)")
    return lone_period.sub("", string)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_fraction_unicode(string: str) -> str:
    """Rewrite unicode vulgar-fraction characters as ASCII ``numerator/denominator``.

    Every character whose unicode name starts with ``VULGAR FRACTION`` is
    replaced, not just the first kind encountered, so a string containing
    several different fractions is converted completely. A space is inserted
    before each fraction so that "1½" becomes "1 1/2"; double spaces created
    this way are collapsed afterwards.

    Args:
        string: The ingredient text to normalise.

    Returns:
        The converted text, or ``string`` unchanged when it holds no vulgar
        fraction characters.
    """
    pieces = []
    found_fraction = False

    for char in string:
        # The default avoids a ValueError for characters without a unicode name.
        if unicodedata.name(char, "").startswith("VULGAR FRACTION"):
            # NFKC expands e.g. "½" to "1", FRACTION SLASH (U+2044), "2".
            normalized = unicodedata.normalize("NFKC", char)
            numerator, _slash, denominator = normalized.partition("\u2044")
            pieces.append(f" {numerator}/{denominator}")
            found_fraction = True
        else:
            pieces.append(char)

    if not found_fraction:
        return string

    # Collapse the double space produced when the fraction already followed a space.
    return "".join(pieces).replace("  ", " ")
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_or_clause(string: str) -> str:
    """Wrap the alternative introduced by the first " or " in parentheses.

    The alternative runs from the first " or " up to the next comma (or the end
    of the string). Everything after that comma is kept as-is, including any
    further commas. A string without " or " is returned unchanged.

    Examples:
        '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
            -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
        '1/4 cup michiu tou or other rice wine'
            -> '1/4 cup michiu tou (or other rice wine)'

    Args:
        string: The ingredient text to rewrite.

    Returns:
        The text with the "or" alternative parenthesised, stripped of
        surrounding whitespace and of a single trailing comma.
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    head, separator, tail = string.partition(" or ")
    if not separator:
        # Nothing to wrap; indexing the split result here used to raise IndexError.
        return string

    # partition keeps the rest of the text intact, so later commas and any
    # further " or " are no longer dropped.
    alternative, comma, remainder = tail.partition(",")

    return f"{head} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
|
||||||
|
|
||||||
|
|
||||||
|
def pre_process_string(string: str) -> str:
    """
    Normalise raw ingredient text before it is handed to the CRF++ model.

    The text is lowercased, then passed through the fraction, period and
    abbreviation clean-up helpers in that order; if it still contains " or "
    the alternative is wrapped in parentheses. The shape being aimed for is
    roughly

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    """
    normalisation_steps = (
        replace_fraction_unicode,
        remove_periods,
        replace_common_abbreviations,
    )

    cleaned = string.lower()
    for step in normalisation_steps:
        cleaned = step(cleaned)

    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the pre-processed form of a few sample ingredient strings."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -1,17 +1,17 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
import unicodedata
|
|
||||||
from fractions import Fraction
|
from fractions import Fraction
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, validator
|
||||||
|
|
||||||
from mealie.core.config import settings
|
from mealie.core.config import settings
|
||||||
from mealie.schema.recipe import RecipeIngredient
|
from mealie.schema.recipe import RecipeIngredient
|
||||||
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
|
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
from .pre_processor import pre_process_string
|
||||||
|
|
||||||
CWD = Path(__file__).parent
|
CWD = Path(__file__).parent
|
||||||
MODEL_PATH = CWD / "model.crfmodel"
|
MODEL_PATH = CWD / "model.crfmodel"
|
||||||
@ -33,6 +33,17 @@ class CRFIngredient(BaseModel):
|
|||||||
comment: Optional[str] = ""
|
comment: Optional[str] = ""
|
||||||
unit: Optional[str] = ""
|
unit: Optional[str] = ""
|
||||||
|
|
||||||
|
@validator("qty", always=True, pre=True)
def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
    """Supply a quantity when none was provided for the ingredient.

    A non-empty ``qty`` is returned untouched. Otherwise, if the ``other``
    value contains a "/" and parses as a fraction, that fraction is returned
    as a float rounded to one decimal place. In every other case the quantity
    defaults to 1.

    Args:
        qty: The raw quantity value.
        values: The other field values passed in by pydantic.

    Returns:
        The original ``qty``, a rounded float derived from ``other``, or 1.
    """
    if qty is not None and qty != "":
        return qty

    # .get(): "other" may be absent from ``values``, so do not index it directly.
    other = values.get("other")

    # Check if other contains a fraction
    if other and "/" in other:
        try:
            return round(float(Fraction(other)), 1)
        except (ValueError, ZeroDivisionError):
            # "other" holds a slash but is not a clean "a/b" (e.g. free text
            # or a zero denominator); deliberately fall back to the default.
            return 1

    return 1
|
||||||
|
|
||||||
|
|
||||||
def _exec_crf_test(input_text):
|
def _exec_crf_test(input_text):
|
||||||
with tempfile.NamedTemporaryFile(mode="w") as input_file:
|
with tempfile.NamedTemporaryFile(mode="w") as input_file:
|
||||||
@ -43,24 +54,8 @@ def _exec_crf_test(input_text):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def fraction_finder(string: str):
|
|
||||||
# TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
|
|
||||||
for c in string:
|
|
||||||
try:
|
|
||||||
name = unicodedata.name(c)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
if name.startswith("VULGAR FRACTION"):
|
|
||||||
normalized = unicodedata.normalize("NFKC", c)
|
|
||||||
numerator, _slash, denominator = normalized.partition("⁄")
|
|
||||||
text = f"{numerator}/{denominator}"
|
|
||||||
return string.replace(c, text)
|
|
||||||
|
|
||||||
return string
|
|
||||||
|
|
||||||
|
|
||||||
def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
|
def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
|
||||||
crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text])
|
crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
|
||||||
|
|
||||||
crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
|
crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
|
||||||
|
|
||||||
@ -89,4 +84,4 @@ if __name__ == "__main__":
|
|||||||
ingredients = convert_crf_models_to_ingredients(crf_models)
|
ingredients = convert_crf_models_to_ingredients(crf_models)
|
||||||
|
|
||||||
for ingredient in ingredients:
|
for ingredient in ingredients:
|
||||||
print(ingredient)
|
print(ingredient.input)
|
||||||
|
@ -28,6 +28,7 @@ def tokenize(s):
|
|||||||
s = re.sub(r"(\d+)oz", r"\1 ounces", s)
|
s = re.sub(r"(\d+)oz", r"\1 ounces", s)
|
||||||
s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
|
s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# TODO: Replace american_units with list of units from database?
|
||||||
american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
|
american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
|
||||||
# The following removes slashes following American units and replaces it with a space.
|
# The following removes slashes following American units and replaces it with a space.
|
||||||
for unit in american_units:
|
for unit in american_units:
|
||||||
|
@ -47,7 +47,7 @@ def unclump(s):
|
|||||||
|
|
||||||
def normalizeToken(s):
|
def normalizeToken(s):
|
||||||
"""
|
"""
|
||||||
ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
|
TODO: FIX THIS. We used to use the pattern.en package to singularize words, but
|
||||||
in the name of simple deployments, we took it out. We should fix this at some
|
in the name of simple deployments, we took it out. We should fix this at some
|
||||||
point.
|
point.
|
||||||
"""
|
"""
|
||||||
@ -222,6 +222,20 @@ def import_data(lines):
|
|||||||
tag, confidence = re.split(r"/", columns[-1], 1)
|
tag, confidence = re.split(r"/", columns[-1], 1)
|
||||||
tag = re.sub(r"^[BI]\-", "", tag).lower()
|
tag = re.sub(r"^[BI]\-", "", tag).lower()
|
||||||
|
|
||||||
|
# TODO: Integrate Confidence into API Response
|
||||||
|
print("Confidence", confidence)
|
||||||
|
|
||||||
|
# new token
|
||||||
|
if prevTag != tag or token == "n/a":
|
||||||
|
display[-1].append((tag, [token]))
|
||||||
|
data[-1][tag] = []
|
||||||
|
prevTag = tag
|
||||||
|
|
||||||
|
# continuation
|
||||||
|
else:
|
||||||
|
display[-1][-1][1].append(token)
|
||||||
|
data[-1][tag].append(token)
|
||||||
|
|
||||||
# ---- DISPLAY ----
|
# ---- DISPLAY ----
|
||||||
# build a structure which groups each token by its tag, so we can
|
# build a structure which groups each token by its tag, so we can
|
||||||
# rebuild the original display name later.
|
# rebuild the original display name later.
|
||||||
|
35
tests/unit_tests/test_nlp_parser.py
Normal file
35
tests/unit_tests/test_nlp_parser.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from fractions import Fraction
|
||||||
|
|
||||||
|
from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TestIngredient:
    """A raw ingredient string paired with the quantity the parser should report."""

    # The "Test" prefix makes pytest try to collect this class and emit a
    # collection warning because it has a constructor. Opt out explicitly.
    # Unannotated, so it is a plain class attribute and not a dataclass field.
    __test__ = False

    # Raw ingredient text fed to the parser.
    input: str
    # Expected quantity, rounded to one decimal place.
    quantity: float
|
||||||
|
|
||||||
|
|
||||||
|
# (raw ingredient text, expected quantity) pairs exercised by the parser test.
test_ingredients = [
    TestIngredient(text, expected_quantity)
    for text, expected_quantity in (
        ("½ cup all-purpose flour", 0.5),
        ("1 ½ teaspoons ground black pepper", 1.5),
        ("⅔ cup unsweetened flaked coconut", 0.7),
        ("⅓ cup panko bread crumbs", 0.3),
    )
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_nlp_parser():
    """Check that the parser reports the expected quantity for each sample ingredient."""
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])

    # zip() silently stops at the shorter sequence, so a parser that drops
    # ingredients would otherwise pass unnoticed.
    assert len(models) == len(test_ingredients)

    # Iterate over models and test_ingredients together
    print()
    for model, test_ingredient in zip(models, test_ingredients):
        print("Testing:", test_ingredient.input, end="")

        # qty may be several whitespace-separated parts such as "1 1/2"; add them up.
        # str() keeps this working should qty come back as a number rather than text.
        parsed_quantity = float(sum(Fraction(s) for s in str(model.qty).split()))

        # Expected values are given to one decimal place (2/3 -> 0.7, 1/3 -> 0.3),
        # so compare at that precision instead of against the exact fraction.
        assert round(parsed_quantity, 1) == test_ingredient.quantity

        print(" ✅ Passed")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_nlp_parser()
|
Loading…
x
Reference in New Issue
Block a user