mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-08-07 09:02:02 -04:00
Fix issue with parsing scraped nutrition (#732)
* Fix issue with parsing scraped nutrition
* Attempt to clean nutrition info
* Allow comma separator
* Fix return type for clean_nutrition. Fail safe in case of unexpected type from scraper
* Switch to using regex parsing
* Formatting
* Cleanup - empty strings no longer a concern
This commit is contained in:
parent
756ffc8e90
commit
b81f88dc18
@ -112,7 +112,7 @@ class RecipeModel(SqlAlchemyBase, BaseMixins):
|
|||||||
self.image = image
|
self.image = image
|
||||||
self.recipeCuisine = recipeCuisine
|
self.recipeCuisine = recipeCuisine
|
||||||
|
|
||||||
self.nutrition = Nutrition(**nutrition) if self.nutrition else Nutrition()
|
self.nutrition = Nutrition(**nutrition) if nutrition else Nutrition()
|
||||||
|
|
||||||
self.tools = [Tool(tool=x) for x in tools] if tools else []
|
self.tools = [Tool(tool=x) for x in tools] if tools else []
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ import html
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
|
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
@ -67,6 +67,40 @@ def clean_html(raw_html):
|
|||||||
return re.sub(cleanr, "", raw_html)
|
return re.sub(cleanr, "", raw_html)
|
||||||
|
|
||||||
|
|
||||||
|
# Matches the first integer or decimal number in a string. Both "." and ","
# are accepted as the decimal separator (the comma form is common in Europe).
# Compiled once at import time so repeated calls do not rebuild the pattern.
_NUTRITION_NUMBER_RE = re.compile(r"\d+([.,]\d+)?")


def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]:
    """Reduce scraped nutrition values to bare numeric strings.

    For every string value in ``nutrition`` the first number found is kept
    (with a decimal comma normalised to a decimal point); entries whose value
    is not a string, or contains no number, are dropped.

    Assumes that all units are supplied in grams, except sodium which may be
    in mg. A ``sodiumContent`` value that appears to be in grams is converted
    to milligrams.

    Args:
        nutrition: Mapping of nutrition field name to its scraped text, e.g.
            ``{"calories": "105 kcal"}``. Anything that is not a dict is
            treated as "no nutrition data".

    Returns:
        A new dict containing only the fields a number could be extracted
        from, each mapped to that number as a string.
    """
    # Fn only expects a dict[str, str]. Other structures should not be parsed.
    if not isinstance(nutrition, dict):
        return {}

    output_nutrition = {}
    for key, val in nutrition.items():
        # Scrapers occasionally hand back lists, numbers or nested dicts;
        # skip those explicitly instead of relying on a blanket except.
        if not isinstance(val, str):
            continue
        if matched_digits := _NUTRITION_NUMBER_RE.search(val):
            output_nutrition[key] = matched_digits.group(0).replace(",", ".")

    # Only attempt the unit conversion when a sodium number was actually
    # extracted. That guarantees the raw value is a string (so the substring
    # checks are safe) and that the key exists in the output.
    if "sodiumContent" in output_nutrition:
        raw_sodium = nutrition["sodiumContent"]
        # Heuristic: a "g" with no "m" anywhere means grams rather than mg.
        if "m" not in raw_sodium and "g" in raw_sodium:
            # Sodium is in grams. Parse its value, multiply by 1k and return to string.
            try:
                output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)
            except ValueError:
                # Could not parse sodium content as float, so don't touch it.
                pass

    return output_nutrition
|
||||||
|
|
||||||
|
|
||||||
def image(image=None) -> str:
|
def image(image=None) -> str:
|
||||||
if not image:
|
if not image:
|
||||||
return "no image"
|
return "no image"
|
||||||
|
@ -137,6 +137,7 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) ->
|
|||||||
slug="",
|
slug="",
|
||||||
image=try_get_default(scraped_data.image, "image", None),
|
image=try_get_default(scraped_data.image, "image", None),
|
||||||
description=try_get_default(None, "description", "", cleaner.clean_string),
|
description=try_get_default(None, "description", "", cleaner.clean_string),
|
||||||
|
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
|
||||||
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
|
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
|
||||||
recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
|
recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
|
||||||
recipe_instructions=get_instructions(),
|
recipe_instructions=get_instructions(),
|
||||||
|
@ -58,6 +58,24 @@ def test_clean_image():
|
|||||||
assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
|
assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "nutrition,expected",
    [
        # Non-dict input produces an empty result.
        (None, {}),
        # Unit text is stripped; only the number is kept.
        ({"calories": "105 kcal"}, {"calories": "105"}),
        # Only the first number in a value is used.
        ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}),
        # A value without any digits is dropped from the output.
        ({"calories": ""}, {}),
        # A non-string value is skipped without affecting the other keys.
        ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}),
        # Sodium given in grams is converted to milligrams.
        ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}),
        # Sodium already in milligrams is left unscaled.
        ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}),
        # A unit that is neither g nor mg is left unscaled.
        ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}),
        # Malformed number: the first match ("10.1") is used, then scaled.
        ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}),
    ],
)
def test_clean_nutrition(nutrition, expected):
    """Check that clean_nutrition returns the expected dict for each input."""
    assert cleaner.clean_nutrition(nutrition) == expected
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"instructions",
|
"instructions",
|
||||||
[
|
[
|
||||||
|
Loading…
x
Reference in New Issue
Block a user