update NLP for ingredients

This commit is contained in:
hay-kot 2021-08-29 17:10:51 -08:00
parent 086098899d
commit 2e6352cfbd
7 changed files with 164 additions and 22 deletions

View File

@ -22,7 +22,7 @@ from mealie.schema.admin import (
) )
from mealie.schema.events import EventNotificationIn from mealie.schema.events import EventNotificationIn
from mealie.schema.recipe import CommentOut, Recipe from mealie.schema.recipe import CommentOut, Recipe
from mealie.schema.user import UpdateGroup, PrivateUser from mealie.schema.user import PrivateUser, UpdateGroup
from mealie.services.image import minify from mealie.services.image import minify

View File

@ -0,0 +1,97 @@
import re
import unicodedata
# Unit abbreviation -> spelled-out unit name. Keys and values keep their
# trailing space so the table's contents are unchanged for any other user;
# the matcher below strips that padding.
replace_abbreviations = {
    "cup ": "cup ",
    "g ": "gram ",
    "kg ": "kilogram ",
    "lb ": "pound ",
    "ml ": "milliliter ",
    "oz ": "ounce ",
    "pint ": "pint ",
    "qt ": "quart ",
    "tbs ": "tablespoon ",
    "tbsp ": "tablespoon ",
    "tsp ": "teaspoon ",
}

# Padding-free view of the table plus one pattern that matches any abbreviation
# that is not glued to another letter. Both are built once at import time, so
# later mutation of `replace_abbreviations` is not picked up.
_abbreviation_lookup = {k.strip(): v.strip() for k, v in replace_abbreviations.items()}
_abbreviation_pattern = re.compile(
    r"(?<![A-Za-z])(?:"
    + "|".join(re.escape(k) for k in sorted(_abbreviation_lookup, key=len, reverse=True))
    + r")(?![A-Za-z])"
)


def replace_common_abbreviations(string: str) -> str:
    """Expand unit abbreviations such as "tsp" or "kg" to their full names.

    An abbreviation is only replaced when it is not directly preceded or
    followed by an ASCII letter. Compared with chained ``str.replace`` calls
    this means:

    * word endings are left alone ("egg yolk" is not turned into "eggram yolk"),
    * "kg" is expanded to "kilogram" instead of being mangled by the "g" rule,
    * an abbreviation at the very end of the string is expanded as well.

    A digit directly in front of the abbreviation ("100g") still counts as a
    match. Matching is case-sensitive.

    Args:
        string: Ingredient text to expand.

    Returns:
        The text with every recognised abbreviation replaced.
    """
    return _abbreviation_pattern.sub(lambda match: _abbreviation_lookup[match.group(0)], string)
def remove_periods(string: str) -> str:
    """Remove every period that has no digit immediately before or after it.

    A period touching a digit on either side (for example the decimal point in
    "1.5") is kept; abbreviation periods such as the one in "tsp." are dropped.
    """
    period_without_digit_neighbour = re.compile(r"(?<!\d)\.(?!\d)")
    return period_without_digit_neighbour.sub("", string)
def replace_fraction_unicode(string: str):
    """Rewrite Unicode vulgar fractions (½, ¼, ...) as ASCII "n/d" text.

    Every character whose Unicode name starts with "VULGAR FRACTION" is
    replaced by a space followed by "numerator/denominator". The leading space
    keeps a preceding whole number apart ("1½" -> "1 1/2"). Any double space in
    the resulting string is then collapsed to a single one.

    Args:
        string: Ingredient text that may contain vulgar fraction characters.

    Returns:
        The converted text, or the input unchanged when it contains no
        vulgar fraction.
    """
    # NFKC turns "½" into "1", U+2044 FRACTION SLASH, "2"; that slash is the
    # separator to split on. Partitioning on an empty string raises ValueError.
    fraction_slash = "\u2044"
    double_space = " " * 2

    pieces = []
    found_fraction = False
    for c in string:
        # unicodedata.name() raises ValueError for unnamed characters unless a
        # default is given; unnamed characters can never be fractions.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            normalized = unicodedata.normalize("NFKC", c)
            numerator, _slash, denominator = normalized.partition(fraction_slash)
            pieces.append(f" {numerator}/{denominator}")
            found_fraction = True
        else:
            pieces.append(c)

    if not found_fraction:
        return string

    return "".join(pieces).replace(double_space, " ")
def wrap_or_clause(string: str):
    """Wrap the alternative that follows the first " or " in parentheses.

    The alternative runs from the first " or " up to the next comma (or to the
    end of the string). Everything after that comma is kept as written.

    Examples:
        '1 teaspoon diamond crystal or 1/2 teaspoon morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'

    Args:
        string: Ingredient text, typically already lower-cased and expanded.

    Returns:
        The text with the alternative wrapped, or the input unchanged when it
        contains no " or ".
    """
    # TODO: Needs more adequate testing to be sure this doesn't have side effects.
    head, separator, tail = string.partition(" or ")
    if not separator:
        # Nothing to wrap; indexing the split result here would raise IndexError.
        return string

    # Split only at the first comma so later commas (and any further " or ")
    # survive instead of being dropped.
    alternative, comma, remainder = tail.partition(",")
    return f"{head} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
def pre_process_string(string: str) -> str:
    """Normalise an ingredient line before it is handed to the CRF++ model.

    The target shape is roughly ``{qty} {unit} {food}, {additional}``, e.g.
    ``1 tbs. wine, expensive or other white wine, plus more``.

    Steps, in order: lower-case the text, convert Unicode fractions, remove
    periods, expand unit abbreviations, and finally wrap an " or " alternative
    in parentheses when one is present.
    """
    cleaned = string.lower()

    for step in (replace_fraction_unicode, remove_periods, replace_common_abbreviations):
        cleaned = step(cleaned)

    return wrap_or_clause(cleaned) if " or " in cleaned else cleaned
def main():
    """Print the pre-processed form of a few sample ingredient lines."""
    # TODO: Migrate to unittests
    samples = (
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more",
        "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt",
        "¼ cup michiu tou or other rice wine",
        "1 tbs. wine, expensive or other white wine, plus more",
    )

    print("Starting...")
    for sample in samples:
        print(pre_process_string(sample))
    print("Finished...")


if __name__ == "__main__":
    main()

View File

@ -1,17 +1,17 @@
import subprocess import subprocess
import tempfile import tempfile
import unicodedata
from fractions import Fraction from fractions import Fraction
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel, validator
from mealie.core.config import settings from mealie.core.config import settings
from mealie.schema.recipe import RecipeIngredient from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
from . import utils from . import utils
from .pre_processor import pre_process_string
CWD = Path(__file__).parent CWD = Path(__file__).parent
MODEL_PATH = CWD / "model.crfmodel" MODEL_PATH = CWD / "model.crfmodel"
@ -33,6 +33,17 @@ class CRFIngredient(BaseModel):
comment: Optional[str] = "" comment: Optional[str] = ""
unit: Optional[str] = "" unit: Optional[str] = ""
@validator("qty", always=True, pre=True)
def validate_qty(qty, values):
    """Supply a quantity when the parser produced none.

    A non-empty ``qty`` is returned untouched. Otherwise, if the ``other``
    value contains a "/" and parses as a fraction, that fraction is returned
    as a float rounded to one decimal place. In every remaining case the
    quantity defaults to 1.

    Args:
        qty: Raw quantity value as received (the validator runs with ``pre=True``).
        values: Field values made available by pydantic.
            NOTE(review): presumably only fields declared before ``qty`` are
            present here - confirm the field order on the model.

    Returns:
        The original ``qty``, a float derived from ``other``, or 1.
    """
    if qty is not None and qty != "":
        return qty

    # .get() instead of indexing: "other" may be missing from `values`.
    other = values.get("other")
    if isinstance(other, str) and "/" in other:
        try:
            return round(float(Fraction(other)), 1)
        except (ValueError, ZeroDivisionError):
            # "other" held a slash but is not a plain fraction (e.g. "and/or"
            # or "1/0"); fall through to the default quantity.
            pass

    return 1
def _exec_crf_test(input_text): def _exec_crf_test(input_text):
with tempfile.NamedTemporaryFile(mode="w") as input_file: with tempfile.NamedTemporaryFile(mode="w") as input_file:
@ -43,24 +54,8 @@ def _exec_crf_test(input_text):
) )
def fraction_finder(string: str):
    """Rewrite Unicode vulgar fractions (½, ¼, ...) as ASCII "n/d" text.

    Every character whose Unicode name starts with "VULGAR FRACTION" is
    replaced by "numerator/denominator". When the fraction directly follows a
    digit, a space is inserted first so that "1½" becomes "1 1/2" rather than
    "11/2".

    Args:
        string: Ingredient text that may contain vulgar fraction characters.

    Returns:
        The converted text; identical to the input when no fraction is present.
    """
    # NFKC turns "½" into "1", U+2044 FRACTION SLASH, "2"; that slash is the
    # separator to split on. Partitioning on an empty string raises ValueError.
    fraction_slash = "\u2044"

    pieces = []
    previous = ""
    for c in string:
        # The "" default avoids the ValueError unicodedata.name() raises for
        # unnamed characters; those can never be fractions.
        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
            normalized = unicodedata.normalize("NFKC", c)
            numerator, _slash, denominator = normalized.partition(fraction_slash)
            separator = " " if previous.isdigit() else ""
            pieces.append(f"{separator}{numerator}/{denominator}")
        else:
            pieces.append(c)
        previous = c

    return "".join(pieces)
def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]): def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text]) crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))] crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
@ -89,4 +84,4 @@ if __name__ == "__main__":
ingredients = convert_crf_models_to_ingredients(crf_models) ingredients = convert_crf_models_to_ingredients(crf_models)
for ingredient in ingredients: for ingredient in ingredients:
print(ingredient) print(ingredient.input)

View File

@ -28,6 +28,7 @@ def tokenize(s):
s = re.sub(r"(\d+)oz", r"\1 ounces", s) s = re.sub(r"(\d+)oz", r"\1 ounces", s)
s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE) s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
# TODO: Replace american_units with list of units from database?
american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"] american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
# The following removes slashes following American units and replaces it with a space. # The following removes slashes following American units and replaces it with a space.
for unit in american_units: for unit in american_units:

View File

@ -47,7 +47,7 @@ def unclump(s):
def normalizeToken(s): def normalizeToken(s):
""" """
ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but TODO: FIX THIS. We used to use the pattern.en package to singularize words, but
in the name of simple deployments, we took it out. We should fix this at some in the name of simple deployments, we took it out. We should fix this at some
point. point.
""" """
@ -222,6 +222,20 @@ def import_data(lines):
tag, confidence = re.split(r"/", columns[-1], 1) tag, confidence = re.split(r"/", columns[-1], 1)
tag = re.sub(r"^[BI]\-", "", tag).lower() tag = re.sub(r"^[BI]\-", "", tag).lower()
# TODO: Integrate Confidence into API Response
print("Confidence", confidence)
# new token
if prevTag != tag or token == "n/a":
display[-1].append((tag, [token]))
data[-1][tag] = []
prevTag = tag
# continuation
else:
display[-1][-1][1].append(token)
data[-1][tag].append(token)
# ---- DISPLAY ---- # ---- DISPLAY ----
# build a structure which groups each token by its tag, so we can # build a structure which groups each token by its tag, so we can
# rebuild the original display name later. # rebuild the original display name later.

View File

@ -0,0 +1,35 @@
from dataclasses import dataclass
from fractions import Fraction
from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model
@dataclass
class TestIngredient:
    """One parser test case: a raw ingredient line and the quantity expected from it."""

    # The class name starts with "Test", so pytest would try to collect it and
    # warn about its __init__; this marks it as a plain helper. It carries no
    # annotation, so the dataclass does not treat it as a field.
    __test__ = False

    input: str  # raw ingredient text passed to the parser
    quantity: float  # quantity the parsed result is compared against
# Sample ingredient lines paired with the quantity each one is expected to yield.
test_ingredients = [
    TestIngredient(input=text, quantity=expected)
    for text, expected in (
        ("½ cup all-purpose flour", 0.5),
        ("1½ teaspoons ground black pepper", 1.5),
        ("⅔ cup unsweetened flaked coconut", 0.7),
        ("⅓ cup panko bread crumbs", 0.3),
    )
]
def test_nlp_parser():
    """Run every sample ingredient through the CRF parser and check its quantity.

    The quantity text of each model is split on whitespace, each part is read
    as a fraction, and the parts are summed (so "1 1/2" gives 1.5). The sum is
    rounded to one decimal place before comparison, because the expected
    values are written to one decimal (2/3 is listed as 0.7).
    """
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])

    # zip() stops at the shorter sequence, so a missing model would otherwise
    # go unnoticed.
    assert len(models) == len(test_ingredients)

    # Iterate over models and test_ingredients together
    print()
    for model, test_ingredient in zip(models, test_ingredients):
        print("Testing:", test_ingredient.input, end="")
        parsed_quantity = float(sum(Fraction(s) for s in model.qty.split()))
        assert round(parsed_quantity, 1) == test_ingredient.quantity
        print(" ✅ Passed")


if __name__ == "__main__":
    test_nlp_parser()