Fix issue with parsing scraped nutrition (#732)

* Fix issue with parsing scraped nutrition

* Attempt to clean nutrition info

* Allow comma separator

* Fix return type for clean_nutrition. Fail safe in case of unexpected type from scraper

* Switch to using regex parsing

* Formatting

* Cleanup - empty strings no longer a concern
This commit is contained in:
cadamswaite 2021-10-20 01:01:05 +01:00 committed by GitHub
parent 756ffc8e90
commit b81f88dc18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 2 deletions

View File

@ -112,7 +112,7 @@ class RecipeModel(SqlAlchemyBase, BaseMixins):
self.image = image self.image = image
self.recipeCuisine = recipeCuisine self.recipeCuisine = recipeCuisine
self.nutrition = Nutrition(**nutrition) if self.nutrition else Nutrition() self.nutrition = Nutrition(**nutrition) if nutrition else Nutrition()
self.tools = [Tool(tool=x) for x in tools] if tools else [] self.tools = [Tool(tool=x) for x in tools] if tools else []

View File

@ -2,7 +2,7 @@ import html
import json import json
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import List from typing import List, Optional
from slugify import slugify from slugify import slugify
@ -67,6 +67,40 @@ def clean_html(raw_html):
return re.sub(cleanr, "", raw_html) return re.sub(cleanr, "", raw_html)
def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]:
    """Reduce scraped nutrition values to bare numeric strings.

    For every string value in ``nutrition`` the first number found (for
    example ``"105"`` in ``"105 kcal 104 sugar"``) is kept, with a decimal
    comma normalised to a decimal point. Entries whose value is not a string,
    or contains no digits, are dropped from the result.

    Assumes that all units are supplied in grams, except sodium which may be
    in mg. A ``sodiumContent`` value that looks like grams (contains "g" but
    no "m") is multiplied by 1000 so the result is expressed in milligrams.

    Args:
        nutrition: Mapping of nutrient name to scraped text. Anything that is
            not a dict (including ``None``) is treated as "no data".

    Returns:
        A new dict mapping nutrient name to its numeric value as a string.
        Empty when nothing could be parsed.
    """
    # Fn only expects a dict[str, str]. Other structures should not be parsed.
    if not isinstance(nutrition, dict):
        return {}

    # Allow for commas as decimals (common in Europe).
    re_match_digits = re.compile(r"\d+([.,]\d+)?")

    output_nutrition = {}
    for key, val in nutrition.items():
        # Non-string values (lists, numbers, None, ...) cannot be searched;
        # skip them explicitly instead of relying on a swallowed exception.
        if not isinstance(val, str):
            continue
        if matched_digits := re_match_digits.search(val):
            output_nutrition[key] = matched_digits.group(0).replace(",", ".")

    # Only convert sodium when a number was actually extracted for it. This
    # also guarantees the raw value is a string, so the unit checks below
    # cannot raise for values such as None or an int, and a digit-less value
    # such as "g" no longer triggers a KeyError.
    sodium_value = output_nutrition.get("sodiumContent")
    if sodium_value is not None:
        sodium_raw = nutrition["sodiumContent"]
        if "m" not in sodium_raw and "g" in sodium_raw:
            # Sodium is in grams. Parse its value, multiply by 1k and return to string.
            try:
                output_nutrition["sodiumContent"] = str(float(sodium_value) * 1000)
            except ValueError:
                # Could not parse sodium content as float, so don't touch it.
                pass

    return output_nutrition
def image(image=None) -> str: def image(image=None) -> str:
if not image: if not image:
return "no image" return "no image"

View File

@ -137,6 +137,7 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) ->
slug="", slug="",
image=try_get_default(scraped_data.image, "image", None), image=try_get_default(scraped_data.image, "image", None),
description=try_get_default(None, "description", "", cleaner.clean_string), description=try_get_default(None, "description", "", cleaner.clean_string),
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
recipe_instructions=get_instructions(), recipe_instructions=get_instructions(),

View File

@ -58,6 +58,24 @@ def test_clean_image():
assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!" assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
@pytest.mark.parametrize(
    ("nutrition", "expected"),
    [
        # Non-dict input yields an empty result.
        (None, {}),
        # Only the first number in a value is kept; units are discarded.
        ({"calories": "105 kcal"}, {"calories": "105"}),
        ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}),
        # Values without any digits are dropped.
        ({"calories": ""}, {}),
        # A non-string value is skipped without affecting its siblings.
        (
            {"calories": ["not just a string"], "sugarContent": "but still tries 555.321"},
            {"sugarContent": "555.321"},
        ),
        # Sodium given in grams is converted to milligrams; other units are left alone.
        ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}),
        ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}),
        ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}),
        # Only the leading "10.1" of a malformed number is used.
        ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}),
    ],
)
def test_clean_nutrition(nutrition, expected):
    """Check that clean_nutrition returns the expected dict for each input."""
    cleaned = cleaner.clean_nutrition(nutrition)
    assert cleaned == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
"instructions", "instructions",
[ [