From 2c80980453a6a785a26d4002904676e39d945ae1 Mon Sep 17 00:00:00 2001
From: hay-kot <hay-kot@pm.me>
Date: Sat, 28 Aug 2021 14:18:56 -0800
Subject: [PATCH] unicode fraction processing

---
 .../scraper/ingredient_nlp/processor.py       | 19 ++++++++++++++++++-
 .../ingredient_nlp/unicode_fraction_dict.py   |  0
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py

diff --git a/mealie/services/scraper/ingredient_nlp/processor.py b/mealie/services/scraper/ingredient_nlp/processor.py
index e3405efd0699..b225bdde9f10 100644
--- a/mealie/services/scraper/ingredient_nlp/processor.py
+++ b/mealie/services/scraper/ingredient_nlp/processor.py
@@ -1,5 +1,6 @@
 import subprocess
 import tempfile
+import unicodedata
 from fractions import Fraction
 from pathlib import Path
 from typing import Optional
@@ -41,8 +42,24 @@ def _exec_crf_test(input_text):
         )
 
 
+def fraction_finder(string: str) -> str:
+    """Return ``string`` with each unicode vulgar fraction in ASCII, e.g. "1½ cups" -> "1 1/2 cups"."""
+    # TODO: needs more testing against real-world ingredient text
+    parts: list[str] = []
+    for c in string:
+        # name() with a default never raises, so unnamed code points pass through untouched
+        if unicodedata.name(c, "").startswith("VULGAR FRACTION"):
+            numerator, _slash, denominator = unicodedata.normalize("NFKC", c).partition("⁄")
+            # a space keeps a mixed number such as "1½" from collapsing into "11/2"
+            pad = " " if parts and parts[-1][-1].isdigit() else ""
+            c = f"{pad}{numerator}/{denominator}"
+        parts.append(c)
+
+    return "".join(parts)
+
+
 def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
-    crf_output = _exec_crf_test(list_of_ingrdeint_text)
+    crf_output = _exec_crf_test([fraction_finder(x) for x in list_of_ingrdeint_text])
 
     crf_models = [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
 
diff --git a/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py b/mealie/services/scraper/ingredient_nlp/unicode_fraction_dict.py
new file mode 100644
index 000000000000..e69de29bb2d1