import unicodedata


def fraction_finder(string: str) -> str:
    """Replace Unicode vulgar-fraction characters with ASCII ``n/d`` text.

    Every character whose Unicode name starts with ``VULGAR FRACTION``
    (for example ``½``, ``¾``, ``⅓``) is expanded through its NFKC
    compatibility decomposition, and the FRACTION SLASH (U+2044) of that
    decomposition is swapped for a plain ``/``.

    Unlike a first-match replacement, all fraction characters in the input
    are converted, not only the first distinct one.  When a fraction directly
    follows a decimal digit, a single space is inserted so that a mixed
    number such as ``1½`` becomes ``1 1/2`` instead of ``11/2``.

    Args:
        string: Text that may contain vulgar-fraction characters.

    Returns:
        A copy of ``string`` with each vulgar fraction written as ``n/d``.
        Text without such characters is returned unchanged.
    """
    pieces: list[str] = []

    for char in string:
        # The ``default`` argument avoids the ValueError that
        # ``unicodedata.name`` raises for code points without a name.
        if not unicodedata.name(char, "").startswith("VULGAR FRACTION"):
            pieces.append(char)
            continue

        # NFKC turns e.g. "½" into "1", U+2044 FRACTION SLASH, "2".
        expanded = unicodedata.normalize("NFKC", char)
        numerator, slash, denominator = expanded.partition("\u2044")
        if not slash:
            # No fraction slash in the decomposition: keep the character
            # rather than emitting a dangling "x/".
            pieces.append(char)
            continue

        # Keep a preceding whole number (or a preceding converted fraction)
        # separate so the digits do not run together.
        if pieces and pieces[-1][-1].isdecimal():
            pieces.append(" ")
        pieces.append(f"{numerator}/{denominator}")

    return "".join(pieces)