From b81f88dc1848125067e43486e3a3519231aeef4e Mon Sep 17 00:00:00 2001 From: cadamswaite Date: Wed, 20 Oct 2021 01:01:05 +0100 Subject: [PATCH] Fix issue with parsing scraped nutrition (#732) * Fix issue with parsing scraped nutrition * Attempt to clean nutrition info * Allow comma separator * Fix return type for clean_nutrition. Fail safe in case of unexpected type from scraper * Switch to using regex parsing * Formatting * Cleanup - empty strings no longer a concern --- mealie/db/models/recipe/recipe.py | 2 +- mealie/services/scraper/cleaner.py | 36 +++++++++++++++++++++++++++++- mealie/services/scraper/scraper.py | 1 + tests/unit_tests/test_cleaner.py | 18 +++++++++++++++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/mealie/db/models/recipe/recipe.py b/mealie/db/models/recipe/recipe.py index e55f36dec6e6..cd1827b0283c 100644 --- a/mealie/db/models/recipe/recipe.py +++ b/mealie/db/models/recipe/recipe.py @@ -112,7 +112,7 @@ class RecipeModel(SqlAlchemyBase, BaseMixins): self.image = image self.recipeCuisine = recipeCuisine - self.nutrition = Nutrition(**nutrition) if self.nutrition else Nutrition() + self.nutrition = Nutrition(**nutrition) if nutrition else Nutrition() self.tools = [Tool(tool=x) for x in tools] if tools else [] diff --git a/mealie/services/scraper/cleaner.py b/mealie/services/scraper/cleaner.py index 0b410a04f6c3..b844ca545df0 100644 --- a/mealie/services/scraper/cleaner.py +++ b/mealie/services/scraper/cleaner.py @@ -2,7 +2,7 @@ import html import json import re from datetime import datetime, timedelta -from typing import List +from typing import List, Optional from slugify import slugify @@ -67,6 +67,40 @@ def clean_html(raw_html): return re.sub(cleanr, "", raw_html) +def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]: + # Assumes that all units are supplied in grams, except sodium which may be in mg. + + # Fn only expects a dict[str,str]. Other structures should not be parsed. 
+ if not isinstance(nutrition, dict): + return {} + + # Allow for commas as decimals (common in Europe) + # Compile once for efficiency + re_match_digits = re.compile(r"\d+([.,]\d+)?") + + output_nutrition = {} + for key, val in nutrition.items(): + # If the val contains digits matching the regex, add the first match to the output dict. + # Handle unexpected data structures safely. + try: + if matched_digits := re_match_digits.search(val): + output_nutrition[key] = matched_digits.group(0) + except Exception: + continue + + output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()} + + if "sodiumContent" in nutrition and "m" not in nutrition["sodiumContent"] and "g" in nutrition["sodiumContent"]: + # Sodium is in grams. Parse its value, multiply by 1k and return to string. + try: + output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000) + except ValueError: + # Could not parse sodium content as float, so don't touch it. + pass + + return output_nutrition + + def image(image=None) -> str: if not image: return "no image" diff --git a/mealie/services/scraper/scraper.py b/mealie/services/scraper/scraper.py index 6b6a83d35fbf..8d65ce604381 100644 --- a/mealie/services/scraper/scraper.py +++ b/mealie/services/scraper/scraper.py @@ -137,6 +137,7 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> slug="", image=try_get_default(scraped_data.image, "image", None), description=try_get_default(None, "description", "", cleaner.clean_string), + nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition), recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), recipe_instructions=get_instructions(), diff --git a/tests/unit_tests/test_cleaner.py b/tests/unit_tests/test_cleaner.py index f695c0077271..e9ef2d8a6740 100644 --- 
a/tests/unit_tests/test_cleaner.py +++ b/tests/unit_tests/test_cleaner.py @@ -58,6 +58,24 @@ def test_clean_image(): assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!" +@pytest.mark.parametrize( + "nutrition,expected", + [ + (None, {}), + ({"calories": "105 kcal"}, {"calories": "105"}), + ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}), + ({"calories": ""}, {}), + ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}), + ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}), + ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}), + ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}), + ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}), + ], +) +def test_clean_nutrition(nutrition, expected): + assert cleaner.clean_nutrition(nutrition) == expected + + @pytest.mark.parametrize( "instructions", [