From 2e6352cfbd01cac0d264680ee5d6c23887f2ddcc Mon Sep 17 00:00:00 2001
From: hay-kot <hay-kot@pm.me>
Date: Sun, 29 Aug 2021 17:10:51 -0800
Subject: [PATCH] update NLP for ingredients

---
 mealie/services/backups/imports.py            |  2 +-
 .../scraper/ingredient_nlp/pre_processor.py   | 97 +++++++++++++++++++
 .../scraper/ingredient_nlp/processor.py       | 35 +++----
 .../scraper/ingredient_nlp/tokenizer.py       |  1 +
 .../ingredient_nlp/unicode_fraction_dict.py   |  0
 .../services/scraper/ingredient_nlp/utils.py  | 16 ++-
 tests/unit_tests/test_nlp_parser.py           | 35 +++++++
 7 files changed, 164 insertions(+), 22 deletions(-)
 create mode 100644 mealie/services/scraper/ingredient_nlp/pre_processor.py
 delete mode 100644 mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
 create mode 100644 tests/unit_tests/test_nlp_parser.py

diff --git a/mealie/services/backups/imports.py b/mealie/services/backups/imports.py
index 742705eabe6b..d46a1a475088 100644
--- a/mealie/services/backups/imports.py
+++ b/mealie/services/backups/imports.py
@@ -22,7 +22,7 @@ from mealie.schema.admin import (
 )
 from mealie.schema.events import EventNotificationIn
 from mealie.schema.recipe import CommentOut, Recipe
-from mealie.schema.user import UpdateGroup, PrivateUser
+from mealie.schema.user import PrivateUser, UpdateGroup
 from mealie.services.image import minify
 
 
diff --git a/mealie/services/scraper/ingredient_nlp/pre_processor.py b/mealie/services/scraper/ingredient_nlp/pre_processor.py
new file mode 100644
index 000000000000..a6a5d4726c47
--- /dev/null
+++ b/mealie/services/scraper/ingredient_nlp/pre_processor.py
@@ -0,0 +1,97 @@
+import re
+import unicodedata
+
+replace_abbreviations = {  # abbreviation -> full unit name; every key ends in a space
+    "cup ": "cup ",
+    "g ": "gram ",
+    "kg ": "kilogram ",
+    "lb ": "pound ",
+    "ml ": "milliliter ",
+    "oz ": "ounce ",
+    "pint ": "pint ",
+    "qt ": "quart ",
+    "tbs ": "tablespoon ",
+    "tbsp ": "tablespoon ",
+    "tsp ": "teaspoon ",
+}
+
+
+def replace_common_abbreviations(string: str) -> str:
+    """Expand unit abbreviations that are not preceded by a letter, so the "g " in "egg " is untouched."""
+    for k, v in replace_abbreviations.items():
+        string = re.sub(rf"(?<![^\W\d_]){re.escape(k)}", v, string)
+    return string
+
+
+def remove_periods(string: str) -> str:
+    """Strips periods unless a digit sits immediately before or after them."""
+    return re.compile(r"(?<!\d)\.(?!\d)").sub("", string)
+
+
+def replace_fraction_unicode(string: str) -> str:
+    """Replace every Unicode vulgar fraction in ``string`` with ASCII text, e.g. "½" -> " 1/2".
+
+    A string without such characters is returned unchanged.
+    """
+    # name(c, "") yields "" for unnamed characters instead of raising ValueError.
+    fractions = {c for c in string if unicodedata.name(c, "").startswith("VULGAR FRACTION")}
+    for c in fractions:
+        # NFKC decomposes e.g. "½" into "1⁄2", joined by U+2044 FRACTION SLASH.
+        numerator, _slash, denominator = unicodedata.normalize("NFKC", c).partition("\u2044")
+        # The leading space keeps "1½" from becoming "11/2".
+        string = string.replace(c, f" {numerator}/{denominator}")
+
+    # Collapse the double space created when the fraction already followed a space.
+    return string.replace("  ", " ") if fractions else string
+
+
+def wrap_or_clause(string: str):
+    """
+    Wraps the first " or " alternative in parentheses, up to the next comma (or the end of the string).
+
+    A string without " or " is returned unchanged; text after the comma is kept as-is.
+
+    Examples:
+    '1 tsp salt or 2 tsp sugar, plus more' -> '1 tsp salt (or 2 tsp sugar), plus more'
+
+    """
+    head, separator, tail = string.partition(" or ")
+    if not separator:
+        return string
+
+    # partition keeps every later comma inside `remainder`, so nothing after the clause is lost.
+    alternative, comma, remainder = tail.partition(",")
+    return f"{head} (or {alternative}){comma}{remainder}".strip().removesuffix(",")
+
+
+def pre_process_string(string: str) -> str:
+    """
+    Normalizes an ingredient line for the CRF++ model. The ideal string looks something like...
+
+    {qty} {unit} {food}, {additional}
+    1 tbs. wine, expensive or other white wine, plus more
+
+    """
+    # Periods are removed before abbreviations are expanded so that "tsp. " can match the "tsp " key.
+    pipeline = (str.lower, replace_fraction_unicode, remove_periods, replace_common_abbreviations)
+    for step in pipeline:
+        string = step(string)
+
+    # Without an " or " there is no clause to wrap.
+    if " or " not in string:
+        return string
+    return wrap_or_clause(string)
+
+
+def main():
+    """Manual smoke check that prints pre-processed sample strings. TODO: Migrate to unittests"""
+    print("Starting...")
+    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"))
+    print(pre_process_string("1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt"))
+    print(pre_process_string("¼ cup michiu tou or other rice wine"))
+    print(pre_process_string("1 tbs. wine, expensive or other white wine, plus more"))
+    print("Finished...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mealie/services/scraper/ingredient_nlp/processor.py b/mealie/services/scraper/ingredient_nlp/processor.py
index 4dafe2c65687..7e18709cf78a 100644
--- a/mealie/services/scraper/ingredient_nlp/processor.py
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@@ -1,17 +1,17 @@
 import subprocess
 import tempfile
-import unicodedata
 from fractions import Fraction
 from pathlib import Path
 from typing import Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 
 from mealie.core.config import settings
 from mealie.schema.recipe import RecipeIngredient
 from mealie.schema.recipe.recipe_ingredient import CreateIngredientFood, CreateIngredientUnit
 
 from . import utils
+from .pre_processor import pre_process_string
 
 CWD = Path(__file__).parent
 MODEL_PATH = CWD / "model.crfmodel"
@@ -33,6 +33,17 @@ class CRFIngredient(BaseModel):
     comment: Optional[str] = ""
     unit: Optional[str] = ""
 
+    @validator("qty", always=True, pre=True)
+    def validate_qty(qty, values):  # sourcery skip: merge-nested-ifs
+        """Default a missing qty to the fraction held in ``other`` (1 decimal place), else to 1."""
+        if qty is not None and qty != "":
+            return qty
+        other = values.get("other") or ""  # absent or None `other` is treated as "no fraction"
+        try:
+            return round(float(Fraction(other)), 1) if "/" in other else 1
+        except (ValueError, ZeroDivisionError):  # "/" present but not a parsable fraction
+            return 1
+
 
 def _exec_crf_test(input_text):
     with tempfile.NamedTemporaryFile(mode="w") as input_file:
@@ -43,24 +54,8 @@ def _exec_crf_test(input_text):
         )
 
 
-def fraction_finder(string: str):
-    # TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
-    for c in string:
-        try:
-            name = unicodedata.name(c)
-        except ValueError:
-            continue
-        if name.startswith("VULGAR FRACTION"):
-            normalized = unicodedata.normalize("NFKC", c)
-            numerator, _slash, denominator = normalized.partition("⁄")
-            text = f"{numerator}/{denominator}"
-            return string.replace(c, text)
-
-    return string
-
-
 def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
-    crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text])
+    crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
 
     crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
 
@@ -89,4 +84,4 @@ if __name__ == "__main__":
     ingredients = convert_crf_models_to_ingredients(crf_models)
 
     for ingredient in ingredients:
-        print(ingredient)
+        print(ingredient.input)
diff --git a/mealie/services/scraper/ingredient_nlp/tokenizer.py b/mealie/services/scraper/ingredient_nlp/tokenizer.py
index 4973d388a692..d899bfb45dcb 100644
--- a/mealie/services/scraper/ingredient_nlp/tokenizer.py
+++ b/mealie/services/scraper/ingredient_nlp/tokenizer.py
@@ -28,6 +28,7 @@ def tokenize(s):
     s = re.sub(r"(\d+)oz", r"\1 ounces", s)
     s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)
 
+    # TODO: Replace american_units with list of units from database?
     american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
     # The following removes slashes following American units and replaces it with a space.
     for unit in american_units:
diff --git a/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py b/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/mealie/services/scraper/ingredient_nlp/utils.py b/mealie/services/scraper/ingredient_nlp/utils.py
index f573ea93a8c7..4b49d5908abc 100644
--- a/mealie/services/scraper/ingredient_nlp/utils.py
+++ b/mealie/services/scraper/ingredient_nlp/utils.py
@@ -47,7 +47,7 @@ def unclump(s):
 
 def normalizeToken(s):
     """
-    ToDo: FIX THIS. We used to use the pattern.en package to singularize words, but
+    TODO: FIX THIS. We used to use the pattern.en package to singularize words, but
     in the name of simple deployments, we took it out. We should fix this at some
     point.
     """
@@ -222,6 +222,20 @@ def import_data(lines):
             tag, confidence = re.split(r"/", columns[-1], 1)
             tag = re.sub(r"^[BI]\-", "", tag).lower()
 
+            # TODO: Integrate Confidence into API Response
+            print("Confidence", confidence)
+
+            # new token
+            if prevTag != tag or token == "n/a":
+                display[-1].append((tag, [token]))
+                data[-1][tag] = []
+                prevTag = tag
+
+            # continuation
+            else:
+                display[-1][-1][1].append(token)
+                data[-1][tag].append(token)
+
             # ---- DISPLAY ----
             # build a structure which groups each token by its tag, so we can
             # rebuild the original display name later.
diff --git a/tests/unit_tests/test_nlp_parser.py b/tests/unit_tests/test_nlp_parser.py
new file mode 100644
index 000000000000..c3e6ecb12baa
--- /dev/null
+++ b/tests/unit_tests/test_nlp_parser.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass
+from fractions import Fraction
+
+from mealie.services.scraper.ingredient_nlp.processor import CRFIngredient, convert_list_to_crf_model
+
+
+@dataclass
+class TestIngredient:  # one parser sample: input text plus the quantity expected from it
+    input: str  # ingredient line passed to convert_list_to_crf_model
+    quantity: float  # value the parsed qty is compared against
+
+
+test_ingredients = [  # unicode-fraction samples checked by test_nlp_parser
+    TestIngredient("½ cup all-purpose flour", 0.5),
+    TestIngredient("1 ½ teaspoons ground black pepper", 1.5),  # whole number plus fraction
+    TestIngredient("⅔ cup unsweetened flaked coconut", 0.7),  # 2/3 to one decimal place
+    TestIngredient("⅓ cup panko bread crumbs", 0.3),  # 1/3 to one decimal place
+]
+
+
+def test_nlp_parser():
+    """Each sample's parsed qty, summed and rounded to one decimal, must equal its expected quantity."""
+    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])
+
+    # Iterate over models and test_ingredients together
+    print()
+    for model, test_ingredient in zip(models, test_ingredients):
+        print("Testing:", test_ingredient.input, end="")
+        # Expected values carry one decimal (2/3 -> 0.7), so round before the exact comparison.
+        assert round(float(sum(Fraction(s) for s in model.qty.split())), 1) == test_ingredient.quantity
+        print(" ✅ Passed")
+
+
+if __name__ == "__main__":
+    test_nlp_parser()