refactor: rewrite cleaner functions for parsing recipe dicts (#1743)

* rewrite cleaner functions

* unify verbiage

* try importing dep during check

* fix syntax

* allow override defaults

* satisfy mypy
This commit is contained in:
Hayden 2022-11-10 15:16:51 -09:00 committed by GitHub
parent 77316d639b
commit 89d0cae51d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 918 additions and 347 deletions

View File

@ -54,7 +54,7 @@ jobs:
id: cache-validate id: cache-validate
if: steps.cached-poetry-dependencies.outputs.cache-hit == 'true' if: steps.cached-poetry-dependencies.outputs.cache-hit == 'true'
run: | run: |
echo "print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true echo "import black;print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true
rm test.py rm test.py
continue-on-error: true continue-on-error: true

View File

@ -1,14 +1,30 @@
import contextlib
import functools
import html import html
import json import json
import operator
import re import re
import typing
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Optional
from slugify import slugify from slugify import slugify
# Pre-compiled patterns shared by the cleaner functions below.

# A run of digits with an optional fractional part. Both "." and "," are accepted
# as the decimal separator (commas are common in Europe).
MATCH_DIGITS = re.compile(r"\d+([.,]\d+)?")

# ISO 8601 duration strings. Years and months are matched but not captured by
# name; the "T" separator is mandatory and seconds may carry a fraction.
MATCH_ISO_STR = re.compile(
    r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?"
    r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$",
)

# HTML tags: `<p>Text</p>` -> `Text` once every match is removed.
MATCH_HTML_TAGS = re.compile(r"<[^<]+?>")

# One or more consecutive spaces: `Hello    World` -> `Hello World` when replaced by one space.
MATCH_MULTI_SPACE = re.compile(r" +")

# Two newlines separated only by white space, i.e. a run of blank lines.
MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
def clean(recipe_data: dict, url=None) -> dict: def clean(recipe_data: dict, url=None) -> dict:
@ -27,167 +43,167 @@ def clean(recipe_data: dict, url=None) -> dict:
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime")) recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
recipe_data["performTime"] = clean_time(recipe_data.get("performTime")) recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime")) recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", [])) recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", []))
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield")) recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient")) recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions")) recipe_data["image"] = clean_image(recipe_data.get("image"))
recipe_data["image"] = image(recipe_data.get("image")) recipe_data["slug"] = slugify(recipe_data.get("name", ""))
recipe_data["slug"] = slugify(recipe_data.get("name")) # type: ignore
recipe_data["orgURL"] = url recipe_data["orgURL"] = url
return recipe_data return recipe_data
def clean_string(text: str) -> str: def clean_string(text: str | list | int) -> str:
if isinstance(text, list): """Cleans a string of HTML tags and extra white space"""
text = text[0] if not isinstance(text, str):
if isinstance(text, list):
text = text[0]
if isinstance(text, int): if isinstance(text, int):
text = str(text) text = str(text)
if text == "" or text is None: if not text:
return "" return ""
text = typing.cast(str, text) # at this point we know text is a string
cleaned_text = html.unescape(text) cleaned_text = html.unescape(text)
cleaned_text = re.sub("<[^<]+?>", "", cleaned_text) cleaned_text = MATCH_HTML_TAGS.sub("", cleaned_text)
cleaned_text = re.sub(" +", " ", cleaned_text) cleaned_text = MATCH_MULTI_SPACE.sub(" ", cleaned_text)
cleaned_text = re.sub("</p>", "\n", cleaned_text) cleaned_text = MATCH_ERRONEOUS_WHITE_SPACE.sub("\n\n", cleaned_text)
cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip() cleaned_text = cleaned_text.replace("</p>", "\n").replace("\xa0", " ").replace("\t", " ").strip()
return cleaned_text return cleaned_text
def category(category: str): def clean_image(image: str | list | dict | None = None, default="no image") -> str:
if isinstance(category, list) and len(category) > 0 and isinstance(category[0], dict): """
# If the category is a list of dicts, it's probably from a migration image attempts to parse the image field from a recipe and return a string. Currenty
# validate that the required fields are present
valid = []
for cat in category:
if "name" in cat and "slug" in cat:
valid.append(cat)
return valid Supported Structures:
- `["https://exmaple.com"]` - A list of strings
- `https://exmaple.com` - A string
- `{ "url": "https://exmaple.com"` - A dictionary with a `url` key
if isinstance(category, str) and category != "": Raises:
return [category] TypeError: If the image field is not a supported type a TypeError is raised.
return [] Returns:
str: "no image" if any empty string is provided or the url of the image
"""
def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]:
# Assumes that all units are supplied in grams, except sodium which may be in mg.
# Fn only expects a dict[str,str]. Other structures should not be parsed.
if not isinstance(nutrition, dict):
return {}
# Allow for commas as decimals (common in Europe)
# Compile once for efficiency
re_match_digits = re.compile(r"\d+([.,]\d+)?")
output_nutrition = {}
for key, val in nutrition.items():
# If the val contains digits matching the regex, add the first match to the output dict.
# Handle unexpected datastructures safely.
try:
if matched_digits := re_match_digits.search(val):
output_nutrition[key] = matched_digits.group(0)
except Exception:
continue
output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()}
if (
"sodiumContent" in nutrition
and type(nutrition["sodiumContent"]) == str
and "m" not in nutrition["sodiumContent"]
and "g" in nutrition["sodiumContent"]
):
# Sodium is in grams. Parse its value, multiple by 1k and return to string.
try:
output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)
except ValueError:
# Could not parse sodium content as float, so don't touch it.
pass
return output_nutrition
def image(image=None) -> str:
if not image: if not image:
return "no image" return default
if isinstance(image, list):
return image[0] match image:
elif isinstance(image, dict): case str(image):
return image["url"] return image
elif isinstance(image, str): case list(image):
return image return image[0]
else: case {"url": str(image)}:
raise Exception(f"Unrecognised image URL format: {image}") return image
case _:
raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
def instructions(instructions) -> list[dict]: def clean_instructions(steps_object: list | dict | str, default: list | None = None) -> list[dict]:
try: """
instructions = json.loads(instructions) instructions attempts to parse the instructions field from a recipe and return a list of
except Exception: dictionaries. See match statement for supported types and structures
pass
if not instructions: Raises:
return [] TypeError: If the instructions field is not a supported type a TypeError is raised.
# Dictionary (Keys: step number strings, Values: the instructions) Returns:
if isinstance(instructions, dict): list[dict]: An ordered list of dictionaries with the keys `text`
instructions = list(instructions.values()) """
if not steps_object:
return default or []
if isinstance(instructions, list) and isinstance(instructions[0], list): match steps_object:
instructions = instructions[0] case [{"text": str()}]: # Base Case
return steps_object
# One long string split by (possibly multiple) new lines case [{"text": str()}, *_]:
if isinstance(instructions, str): # The is the most common case. Most other operations eventually resolve to this
return [{"text": _instruction(line)} for line in instructions.splitlines() if line] # match case before being converted to a list of instructions
#
# Plain strings in a list # [
elif isinstance(instructions, list) and isinstance(instructions[0], str): # {"text": "Instruction A"},
return [{"text": _instruction(step)} for step in instructions] # {"text": "Instruction B"},
# ]
# Dictionaries (let's assume it's a HowToStep) in a list #
elif isinstance(instructions, list) and isinstance(instructions[0], dict): return [
# Try List of Dictionary without "@type" or "type" {"text": _sanitize_instruction_text(instruction["text"])}
if not instructions[0].get("@type", False) and not instructions[0].get("type", False): for instruction in steps_object
return [{"text": _instruction(step["text"])} for step in instructions] if instruction["text"].strip()
]
try: case {0: {"text": str()}} | {"0": {"text": str()}} | {1: {"text": str()}} | {"1": {"text": str()}}:
# If HowToStep is under HowToSection # Some recipes have a dict with a string key representing the index, unsure if these can
sectionSteps = [] # be an int or not so we match against both. Additionally, we match against both 0 and 1 indexed
for step in instructions: # list like dicts.
if step["@type"] == "HowToSection": #
for sectionStep in step["itemListElement"]: # {
sectionSteps.append(sectionStep) # "0": {"text": "Instruction A"},
# "1": {"text": "Instruction B"},
if len(sectionSteps) > 0: # }
return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"] #
steps_object = typing.cast(dict, steps_object)
return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"] return clean_instructions([x for x in steps_object.values()])
except Exception as e: case str(step_as_str):
logger.error(e) # Strings are weird, some sites return a single string with newlines
# Not "@type", try "type" # others returns a json string for some reasons
try: #
return [ # "Instruction A\nInstruction B\nInstruction C"
{"text": _instruction(step["properties"]["text"])} # '{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}'
for step in instructions #
if step["type"].find("HowToStep") > -1 if step_as_str.startswith("[") or step_as_str.startswith("{"):
] try:
except Exception: return clean_instructions(json.loads(step_as_str))
pass except json.JSONDecodeError:
pass
else: return [
raise Exception(f"Unrecognised instruction format: {instructions}") {"text": _sanitize_instruction_text(instruction)}
for instruction in step_as_str.splitlines()
return [] if instruction.strip()
]
case [str(), *_]:
# Assume list of strings is a valid list of instructions
#
# [
# "Instruction A",
# "Instruction B",
# ]
#
return [
{"text": _sanitize_instruction_text(instruction)} for instruction in steps_object if instruction.strip()
]
case [{"@type": "HowToSection"}, *_] | [{"type": "HowToSection"}, *_]:
# HowToSections should have the following layout,
# {
# "@type": "HowToSection",
# "itemListElement": [
# {
# "@type": "HowToStep",
# "text": "Instruction A"
# },
# }
#
steps_object = typing.cast(list[dict[str, str]], steps_object)
return clean_instructions(functools.reduce(operator.concat, [x["itemListElement"] for x in steps_object], [])) # type: ignore
case _:
raise TypeError(f"Unexpected type for instructions: {type(steps_object)}, {steps_object}")
def _instruction(line) -> str: def _sanitize_instruction_text(line: str | dict) -> str:
"""
_sanitize_instructions_text does some basic checking if the value is a string or dictionary
and returns the value of the `text` key if it is a dictionary. The returned string is passed through the
`clean_string` function to remove any html tags and extra whitespace in a loop until the string
is stable.
Calling `clean_string` in a loop is necessary because some sites return a string with erroneously escaped
html tags or markup.
"""
if isinstance(line, dict): if isinstance(line, dict):
# Some Recipes dotnot adhear to schema # Some Recipes dotnot adhear to schema
try: try:
@ -195,58 +211,111 @@ def _instruction(line) -> str:
except Exception: except Exception:
line = "" line = ""
if not line:
return ""
line = typing.cast(str, line)
clean_line = clean_string(line.strip()) clean_line = clean_string(line.strip())
# Some sites erroneously escape their strings on multiple levels
while not clean_line == (clean_line := clean_string(clean_line)): while not clean_line == (clean_line := clean_string(clean_line)):
pass pass
return clean_line return clean_line
def ingredient(ingredients: list | None) -> list[str]: def clean_ingredients(ingredients: list | str | None, default: list = None) -> list[str]:
if ingredients: """
return [clean_string(ing) for ing in ingredients] ingredient attempts to parse the ingredients field from a recipe and return a list of
else:
return [] Supported Structures:
- `["1 cup flour"]` - A list of strings
- `"1 cup flour"` - A string
- `None` - returns an empty list
Raises:
TypeError: If the ingredients field is not a supported type a TypeError is raised.
"""
match ingredients:
case None:
return default or []
case list(ingredients):
return [clean_string(ingredient) for ingredient in ingredients]
case str(ingredients):
return [clean_string(ingredient) for ingredient in ingredients.splitlines()]
case _:
raise TypeError(f"Unexpected type for ingredients: {type(ingredients)}, {ingredients}")
def yield_amount(yld) -> str: def clean_yield(yld: str | list[str] | None) -> str:
"""
yield_amount attemps to parse out the yield amount from a recipe.
Supported Structures:
- `"4 servings"` - returns the string unmodified
- `["4 servings", "4 Pies"]` - returns the last value
Returns:
str: The yield amount, if it can be parsed else an empty string
"""
if not yld:
return ""
if isinstance(yld, list): if isinstance(yld, list):
return yld[-1] return yld[-1]
else:
return yld return yld
def clean_time(time_entry): def clean_time(time_entry: str | timedelta | None) -> None | str:
if time_entry is None or time_entry == "" or time_entry == " ": """_summary_
Supported Structures:
- `None` - returns None
- `"PT1H"` - returns "1 hour"
- `"PT1H30M"` - returns "1 hour 30 minutes"
- `timedelta(hours=1, minutes=30)` - returns "1 hour 30 minutes"
Raises:
TypeError: if the type is not supported a TypeError is raised
Returns:
None | str: None if the time_entry is None, otherwise a string representing the time
"""
if not time_entry:
return None return None
elif isinstance(time_entry, timedelta):
return pretty_print_timedelta(time_entry) match time_entry:
elif isinstance(time_entry, datetime): case str(time_entry):
pass if not time_entry.strip():
# print(time_entry) return None
elif isinstance(time_entry, str):
try: try:
time_delta_object = parse_duration(time_entry) time_delta_instructionsect = parse_duration(time_entry)
return pretty_print_timedelta(time_delta_object) return pretty_print_timedelta(time_delta_instructionsect)
except ValueError: except ValueError:
logger.error(f"Could not parse time_entry `{time_entry}`") return str(time_entry)
case timedelta():
return pretty_print_timedelta(time_entry)
case datetime():
# TODO: Not sure what to do here
return str(time_entry) return str(time_entry)
else: case _:
return str(time_entry) raise TypeError(f"Unexpected type for time: {type(time_entry)}, {time_entry}")
def parse_duration(iso_duration): def parse_duration(iso_duration: str) -> timedelta:
"""Parses an ISO 8601 duration string into a datetime.timedelta instance. """
Parses an ISO 8601 duration string into a datetime.timedelta instance.
Args: Args:
iso_duration: an ISO 8601 duration string. iso_duration: an ISO 8601 duration string.
Returns:
a datetime.timedelta instance Raises:
ValueError: if the input string is not a valid ISO 8601 duration string.
""" """
m = re.match(
r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?" m = MATCH_ISO_STR.match(iso_duration)
r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$",
iso_duration,
)
if m is None: if m is None:
raise ValueError("invalid ISO 8601 duration string") raise ValueError("invalid ISO 8601 duration string")
@ -257,7 +326,7 @@ def parse_duration(iso_duration):
# convert parsed years and months to specific number of days. # convert parsed years and months to specific number of days.
times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0}
for unit, _ in times.items(): for unit in times.keys():
if m.group(unit): if m.group(unit):
times[unit] = int(float(m.group(unit))) times[unit] = int(float(m.group(unit)))
@ -299,30 +368,73 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places
return " ".join(out_list) return " ".join(out_list)
def clean_categories(category: str | list) -> list[str]:
if not category:
return []
match category:
case str(category):
if not category.strip():
return []
return [category]
case [str(), *_]:
return [cat.strip().title() for cat in category if cat.strip()]
case [{"name": str(), "slug": str()}, *_]:
# Special case for when we use the cleaner to cleanup a migration.
#
# [
# { "name": "Dessert", "slug": "dessert"}
# ]
#
return [cat["name"] for cat in category if "name" in cat]
case _:
raise TypeError(f"Unexpected type for category: {type(category)}, {category}")
def clean_tags(data: str | list[str]) -> list[str]: def clean_tags(data: str | list[str]) -> list[str]:
""" """
Gets keywords as a list or natural language list and returns them into a list of strings of individual tags Gets keywords as a list or natural language list and returns
them into a list of strings of individual tags
""" """
if data is None: if not data:
return [] return []
if isinstance(data, list): match data:
all_str = True case [str(), *_]:
i = 0 return [tag.strip().title() for tag in data if tag.strip()]
while all_str and i < len(data): case str(data):
all_str = isinstance(data[i], str) return clean_tags([t for t in data.split(",")])
i = i + 1 case _:
return []
# should probably raise exception
# raise TypeError(f"Unexpected type for tags: {type(data)}, {data}")
if all_str:
return data
return []
if isinstance(data, str): def clean_nutrition(nutrition: dict | None) -> dict[str, str]:
tag_list = data.split(",") """
clean_nutrition takes a dictionary of nutrition information and cleans it up
to be stored in the database. It will remove any keys that are not in the
list of valid keys
for i in range(len(tag_list)): Assumptionas:
tag_list[i] = tag_list[i].strip().capitalize() - All units are supplied in grams, expect sodium which maybe be in milligrams
return tag_list Returns:
dict[str, str]: If the argument is None, or not a dictionary, an empty dictionary is returned
"""
if not isinstance(nutrition, dict):
return {}
return [] output_nutrition = {}
for key, val in nutrition.items():
with contextlib.suppress(AttributeError, TypeError):
if matched_digits := MATCH_DIGITS.search(val):
output_nutrition[key] = matched_digits.group(0).replace(",", ".")
if sodium := nutrition.get("sodiumContent", None):
if isinstance(sodium, str) and "m" not in sodium and "g" in sodium:
with contextlib.suppress(AttributeError, TypeError):
output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)
return output_nutrition

View File

@ -123,7 +123,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
self.logger.debug(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}") self.logger.debug(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
instruction_as_text = cleaner.instructions(instruction_as_text) instruction_as_text = cleaner.clean_instructions(instruction_as_text)
self.logger.debug(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}") self.logger.debug(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
@ -147,7 +147,9 @@ class RecipeScraperPackage(ABCScraperStrategy):
description=try_get_default(None, "description", "", cleaner.clean_string), description=try_get_default(None, "description", "", cleaner.clean_string),
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition), nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), recipe_ingredient=try_get_default(
scraped_data.ingredients, "recipeIngredient", [""], cleaner.clean_ingredients
),
recipe_instructions=get_instructions(), recipe_instructions=get_instructions(),
total_time=try_get_default(None, "totalTime", None, cleaner.clean_time), total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time), prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),

View File

@ -128,20 +128,20 @@ def test_create_by_url_with_tags(
response = api_client.get(api_routes.recipes_slug(slug), headers=unique_user.token) response = api_client.get(api_routes.recipes_slug(slug), headers=unique_user.token)
assert response.status_code == 200 assert response.status_code == 200
# Verifiy the tags are present # Verifiy the tags are present and title cased
expected_tags = { expected_tags = {
"sauté", "Sauté",
"pea", "Pea",
"noodle", "Noodle",
"udon noodle", "Udon Noodle",
"ramen noodle", "Ramen Noodle",
"dinner", "Dinner",
"main", "Main",
"vegetarian", "Vegetarian",
"easy", "Easy",
"quick", "Quick",
"weeknight meals", "Weeknight Meals",
"web", "Web",
} }
recipe = json.loads(response.text) recipe = json.loads(response.text)

View File

@ -0,0 +1,56 @@
import json
import re
from pathlib import Path
import pytest
from mealie.services.scraper import cleaner
from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
from tests import data as test_data
# URL validation pattern taken from Django 1.3.x:
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
_URL_PATTERN_PARTS = (
    r"^(?:http|ftp)s?://",  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|",  # domain...
    r"localhost|",  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})",  # ...or ip
    r"(?::\d+)?",  # optional port
    r"(?:/?|[/?]\S+)$",  # optional path or query
)
url_validation_regex = re.compile("".join(_URL_PATTERN_PARTS), re.IGNORECASE)
# Parameters for `test_cleaner_clean`: each entry pairs a recipe JSON fixture with
# the number of instruction steps expected once the recipe has been cleaned.
test_cleaner_data = [
    (test_data.json_best_homemade_salsa_recipe, 2),
    (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3),
    (test_data.json_bon_appetit, 8),
    (test_data.json_chunky_apple_cake, 4),
    (test_data.json_dairy_free_impossible_pumpkin_pie, 7),
    (test_data.json_how_to_make_instant_pot_spaghetti, 8),
    (test_data.json_instant_pot_chicken_and_potatoes, 4),
    (test_data.json_instant_pot_kerala_vegetable_stew, 13),
    (test_data.json_jalapeno_popper_dip, 4),
    (test_data.json_microwave_sweet_potatoes_04783, 4),
    (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4),
    (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3),
]
@pytest.mark.parametrize("json_file,num_steps", test_cleaner_data)
def test_cleaner_clean(json_file: Path, num_steps):
    """Cleaning a recipe fixture yields the expected number of instruction steps."""
    raw_recipe = json.loads(json_file.read_text())
    cleaned_recipe = cleaner.clean(raw_recipe)
    cleaned_steps = cleaned_recipe["recipeInstructions"]
    assert len(cleaned_steps) == num_steps
def test_html_with_recipe_data():
    """The OpenGraph strategy extracts the basic recipe fields from a saved HTML page."""
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    html_fixture = test_data.html_healthy_pasta_bake_60759

    strategy = RecipeScraperOpenGraph(url)
    fields = strategy.get_recipe_fields(html_fixture.read_text())

    assert len(fields["name"]) > 10
    assert len(fields["slug"]) > 10
    assert fields["orgURL"] == url
    assert len(fields["description"]) > 100
    # The image must be a well-formed URL rather than a placeholder.
    assert url_validation_regex.match(fields["image"])

View File

@ -0,0 +1,541 @@
from dataclasses import dataclass
from datetime import timedelta
from typing import Any
import pytest
from mealie.services.scraper import cleaner
@dataclass(slots=True)
class CleanerCase:
test_id: str
input: Any
expected: Any
exception: Any = None
# Scenarios for `cleaner.clean_string`: raw input and the cleaned text expected back.
clean_string_test_cases = (
    CleanerCase(
        test_id="empty_string",
        input="",
        expected="",
    ),
    CleanerCase(
        test_id="html",
        input="<p> Hello World </p>",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="no_change",
        input="Hello World",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="html_with_extra_closing_tag",
        input="<p> Hello World </p></p>",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="multiple_spaces",
        # The input must really contain a run of spaces, otherwise this case is
        # indistinguishable from "no_change".
        input="Hello    World",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="tabs",
        input="\tHello World\t",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="nbsp",
        input="\xa0Hello World\xa0",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="list",
        # Only the first element of a list is cleaned.
        input=["Hello World", "Goodbye World"],
        expected="Hello World",
    ),
    CleanerCase(
        test_id="int",
        input=1,
        expected="1",
    ),
)
@pytest.mark.parametrize("case", clean_string_test_cases, ids=(x.test_id for x in clean_string_test_cases))
def test_cleaner_clean_string(case: CleanerCase) -> None:
    """`clean_string` returns the expected cleaned text for each scenario."""
    cleaned = cleaner.clean_string(case.input)
    assert case.expected == cleaned
# Scenarios for `cleaner.clean_image`: raw image field and the url string expected back.
image_cleaner_test_cases = (
    CleanerCase(
        test_id="empty_string",
        input="",
        # Empty input falls back to the cleaner's default placeholder.
        expected="no image",
    ),
    CleanerCase(
        test_id="no_change",
        input="https://example.com/image.jpg",
        expected="https://example.com/image.jpg",
    ),
    CleanerCase(
        test_id="dict with url key",
        input={"url": "https://example.com/image.jpg"},
        expected="https://example.com/image.jpg",
    ),
    CleanerCase(
        test_id="list of strings",
        input=["https://example.com/image.jpg"],
        expected="https://example.com/image.jpg",
    ),
)
@pytest.mark.parametrize("case", image_cleaner_test_cases, ids=(x.test_id for x in image_cleaner_test_cases))
def test_cleaner_image_cleaner(case: CleanerCase):
    """`clean_image` reduces every supported image shape to the expected url string."""
    assert case.expected == cleaner.clean_image(case.input)
# Scenarios for `cleaner.clean_instructions`. Every input is a different encoding of
# the same three steps, so `expected` is left as None and the test compares each
# result against one fixed list instead. Test ids are unique so that a failing
# scenario can be identified and selected unambiguously.
instruction_test_cases = (
    CleanerCase(
        test_id="single string",
        input="Instruction A\nInstruction B\nInstruction C",
        expected=None,
    ),
    CleanerCase(
        test_id="single string multiple newlines",
        input="Instruction A\n\nInstruction B\n\nInstruction C",
        expected=None,
    ),
    CleanerCase(
        test_id="common list of dicts",
        input=[
            {"text": "Instruction A"},
            {"text": "Instruction B"},
            {"text": "Instruction C"},
        ],
        expected=None,
    ),
    CleanerCase(
        test_id="dict with int keys (0-indexed)",
        input={
            0: {"text": "Instruction A"},
            1: {"text": "Instruction B"},
            2: {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with str num keys (0-indexed)",
        input={
            "0": {"text": "Instruction A"},
            "1": {"text": "Instruction B"},
            "2": {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with str num keys (1-indexed)",
        input={
            "1": {"text": "Instruction A"},
            "2": {"text": "Instruction B"},
            "3": {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with int keys (1-indexed)",
        input={
            1: {"text": "Instruction A"},
            2: {"text": "Instruction B"},
            3: {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="raw json str",
        input='{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}',
        expected=None,
    ),
    CleanerCase(
        test_id="how to steps",
        input=[
            {
                "@type": "HowToSection",
                "itemListElement": [
                    {
                        "@type": "HowToStep",
                        "text": "Instruction A",
                    },
                    {
                        "@type": "HowToStep",
                        "text": "Instruction B",
                    },
                ],
            },
            {
                "@type": "HowToSection",
                "itemListElement": [
                    {
                        "@type": "HowToStep",
                        "text": "Instruction C",
                    },
                ],
            },
        ],
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (1)",
        input="Instruction A\n\nInstruction B\n\nInstruction C\n\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (2)",
        input="Instruction A\nInstruction B\nInstruction C\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (3)",
        input="Instruction A\r\n\r\nInstruction B\r\n\r\nInstruction C\r\n\r\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (4)",
        input="Instruction A\r\nInstruction B\r\nInstruction C\r\n",
        expected=None,
    ),
)
@pytest.mark.parametrize("instructions", instruction_test_cases, ids=(x.test_id for x in instruction_test_cases))
def test_cleaner_instructions(instructions: CleanerCase):
    """Every supported encoding of the instructions resolves to the same three steps."""
    expected_steps = [{"text": f"Instruction {letter}"} for letter in "ABC"]
    cleaned_steps = cleaner.clean_instructions(instructions.input)
    assert cleaned_steps == expected_steps
# Scenarios for `cleaner.clean_ingredients`: raw ingredients field and the list of
# ingredient strings expected back, or the exception expected for unsupported input.
ingredients_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=[],
    ),
    CleanerCase(
        test_id="single ingredient string",
        input="1 cup of flour",
        expected=["1 cup of flour"],
    ),
    CleanerCase(
        test_id="single ingredient list",
        input=["1 cup of flour"],
        expected=["1 cup of flour"],
    ),
    CleanerCase(
        test_id="multiple ingredient list",
        input=["1 cup of flour", "1 cup of sugar"],
        expected=["1 cup of flour", "1 cup of sugar"],
    ),
    CleanerCase(
        test_id="multiple ingredient dictionary",
        # Dictionaries are not a supported shape and must be rejected.
        input={"0": "1 cup of flour", "1": "1 cup of sugar"},
        expected=None,
        exception=TypeError,
    ),
)
@pytest.mark.parametrize("ingredients", ingredients_test_cases, ids=(x.test_id for x in ingredients_test_cases))
def test_cleaner_clean_ingredients(ingredients: CleanerCase):
    """`clean_ingredients` returns the expected list, or raises for unsupported input."""
    if not ingredients.exception:
        assert ingredients.expected == cleaner.clean_ingredients(ingredients.input)
    else:
        with pytest.raises(ingredients.exception):
            cleaner.clean_ingredients(ingredients.input)
# Scenarios for `cleaner.clean_yield`: raw yield field and the string expected back.
yield_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected="",
    ),
    CleanerCase(
        test_id="list of strings",
        # When a list is supplied the last entry is the one returned.
        input=["Makes 4 Batches", "4 Batches"],
        expected="4 Batches",
    ),
    CleanerCase(
        test_id="basic string",
        input="Makes 4 Batches",
        expected="Makes 4 Batches",
    ),
    CleanerCase(
        test_id="empty list",
        input=[],
        expected="",
    ),
)
@pytest.mark.parametrize("case", yield_test_cases, ids=(x.test_id for x in yield_test_cases))
def test_cleaner_clean_yield_amount(case: CleanerCase):
    """`clean_yield` returns the expected yield string for each scenario."""
    assert case.expected == cleaner.clean_yield(case.input)
# Scenarios for `cleaner.clean_time`: raw time field and the readable string (or
# None) expected back. Test ids are unique so each scenario can be selected by name.
time_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=None,
    ),
    CleanerCase(
        test_id="emtpy whitespace",
        input=" ",
        expected=None,
    ),
    CleanerCase(
        test_id="none",
        input=None,
        expected=None,
    ),
    CleanerCase(
        test_id="invalid string",
        # Text that is not an ISO 8601 duration is passed through unchanged.
        input="invalid",
        expected="invalid",
    ),
    CleanerCase(
        test_id="timedelta",
        input=timedelta(minutes=30),
        expected="30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (1)",
        input="PT2H30M",
        expected="2 Hours 30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (2)",
        input="PT30M",
        expected="30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (3)",
        input="PT2H",
        expected="2 Hours",
    ),
    CleanerCase(
        test_id="timedelta string (4)",
        input="P1DT1H1M1S",
        expected="1 day 1 Hour 1 Minute 1 Second",
    ),
    CleanerCase(
        test_id="timedelta string (4) fractional seconds",
        # The fraction of a second is expected to be dropped from the output.
        input="P1DT1H1M1.53S",
        expected="1 day 1 Hour 1 Minute 1 Second",
    ),
    CleanerCase(
        test_id="timedelta string (5) invalid",
        # A duration with no components; the expected value is the string "none",
        # not the None object.
        input="PT",
        expected="none",
    ),
    CleanerCase(
        test_id="timedelta string (6) PT-3H",
        # Negative components are not parsed, so the text comes back unchanged.
        input="PT-3H",
        expected="PT-3H",
    ),
)
@pytest.mark.parametrize("case", time_test_cases, ids=(x.test_id for x in time_test_cases))
def test_cleaner_clean_time(case: CleanerCase):
    """`clean_time` returns the expected readable duration (or None) for each scenario."""
    assert case.expected == cleaner.clean_time(case.input)
# Scenarios for `cleaner.clean_categories`: raw category field and the list of
# category names expected back.
category_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=[],
    ),
    CleanerCase(
        test_id="emtpy whitespace",
        input=" ",
        expected=[],
    ),
    CleanerCase(
        test_id="emtpy list",
        input=[],
        expected=[],
    ),
    CleanerCase(
        test_id="single string",
        input="Dessert",
        expected=["Dessert"],
    ),
    CleanerCase(
        test_id="nested dictionary",
        # Dictionaries with `name` and `slug` keys are reduced to their names.
        input=[
            {"name": "Dessert", "slug": "dessert"},
            {"name": "Lunch", "slug": "lunch"},
        ],
        expected=["Dessert", "Lunch"],
    ),
)
@pytest.mark.parametrize(
    "case",
    category_test_cases,
    ids=[candidate.test_id for candidate in category_test_cases],
)
def test_cleaner_clean_categories(case: CleanerCase):
    """Check that ``cleaner.clean_categories`` turns ``case.input`` into ``case.expected``."""
    assert case.expected == cleaner.clean_categories(case.input)
# Cases for ``cleaner.clean_tags``, written as (test id, raw input, expected tags) rows
# and expanded into ``CleanerCase`` objects.
tag_test_cases = tuple(
    CleanerCase(test_id=label, input=raw, expected=tags)
    for label, raw, tags in (
        ("empty string", "", []),
        ("single tag", "tag", ["Tag"]),
        ("comma separated tags", "tag1, tag2, tag3", ["Tag1", "Tag2", "Tag3"]),
        ("list of tags", ["tag1", "tag2", "tag3"], ["Tag1", "Tag2", "Tag3"]),
    )
)
@pytest.mark.parametrize(
    "case",
    tag_test_cases,
    ids=[candidate.test_id for candidate in tag_test_cases],
)
def test_cleaner_clean_tags(case: CleanerCase):
    """Check that ``cleaner.clean_tags`` turns ``case.input`` into ``case.expected``."""
    assert case.expected == cleaner.clean_tags(case.input)
# Input/expected pairs fed to ``cleaner.clean_nutrition``.
# Each ``test_id`` becomes the pytest id of its case, so the ids are spelled correctly
# to keep ``-k`` selection and failure reports readable.
nutrition_test_cases = (
    CleanerCase(
        test_id="empty dict",
        input={},
        expected={},
    ),
    # Known keys are kept and only the numeric part of each value is expected back.
    CleanerCase(
        test_id="valid keys",
        input={
            "calories": "100mg",
            "fatContent": "10",
        },
        expected={
            "calories": "100",
            "fatContent": "10",
        },
    ),
    CleanerCase(
        test_id="invalid keys get removed",
        input={
            "calories": "100mg",
            "fatContent": "10",
            "invalid": "invalid",
        },
        expected={
            "calories": "100",
            "fatContent": "10",
        },
    ),
    # A comma is treated as the decimal separator and is expected back as a dot.
    CleanerCase(
        test_id="support `,` separated numbers instead of `.` (common in Europe)",
        input={
            "calories": "100,000mg",
            "fatContent": "10,000",
        },
        expected={
            "calories": "100.000",
            "fatContent": "10.000",
        },
    ),
    # sodiumContent given in grams is expected back converted to milligrams.
    CleanerCase(
        test_id="special support for sodiumContent (g -> mg)",
        input={
            "sodiumContent": "10g",
        },
        expected={
            "sodiumContent": "10000.0",
        },
    ),
    CleanerCase(
        test_id="special support for sodiumContent (mg -> mg)",
        input={
            "sodiumContent": "10000mg",
        },
        expected={
            "sodiumContent": "10000",
        },
    ),
    CleanerCase(
        test_id="strip units",
        input={
            "calories": "100 kcal",
        },
        expected={
            "calories": "100",
        },
    ),
    # A list value is dropped, but the remaining keys are still processed.
    CleanerCase(
        test_id="list as value continues after first value",
        input={
            "calories": ["100 kcal"],
            "sugarContent": "but still tries 555.321",
        },
        expected={
            "sugarContent": "555.321",
        },
    ),
    # Only the first "digits.digits" run is used: "10.1" g is expected as 10100.0 mg.
    CleanerCase(
        test_id="multiple decimals",
        input={
            "sodiumContent": "10.1.2g",
        },
        expected={
            "sodiumContent": "10100.0",
        },
    ),
)
@pytest.mark.parametrize(
    "case",
    nutrition_test_cases,
    ids=[candidate.test_id for candidate in nutrition_test_cases],
)
def test_cleaner_clean_nutrition(case: CleanerCase):
    """Check that ``cleaner.clean_nutrition`` turns ``case.input`` into ``case.expected``."""
    assert case.expected == cleaner.clean_nutrition(case.input)
@pytest.mark.parametrize(
    "t,max_components,max_decimal_places,expected",
    [
        # 17280 seconds is 4 hours 48 minutes.
        (timedelta(days=2, hours=4, minutes=48), None, 2, "2 days 4 Hours 48 Minutes"),
        (timedelta(days=2, hours=4, minutes=48), 1, 2, "2.2 days"),
        (timedelta(days=365), None, 2, "1 year"),
    ],
)
def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
    """Check the text ``cleaner.pretty_print_timedelta`` produces for each parameter set."""
    rendered = cleaner.pretty_print_timedelta(t, max_components, max_decimal_places)
    assert rendered == expected

View File

@ -1,140 +0,0 @@
import json
import re
from datetime import timedelta
from pathlib import Path
import pytest
from mealie.services.scraper import cleaner
from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
from tests import data as test_data
# URL validation pattern taken from Django 1.3:
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
url_validation_regex = re.compile(
    "".join(
        (
            # scheme: http://, https://, ftp:// or ftps://
            r"^(?:http|ftp)s?://",
            # host, alternative 1: a dotted domain name
            r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|",
            # host, alternative 2: localhost
            r"localhost|",
            # host, alternative 3: dotted-quad IP address
            r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})",
            # optional port
            r"(?::\d+)?",
            # optional path / query
            r"(?:/?|[/?]\S+)$",
        )
    ),
    flags=re.IGNORECASE,
)
# Parameters for ``test_cleaner_clean``: each entry pairs a JSON recipe fixture from
# ``tests.data`` with the number of ``recipeInstructions`` entries expected once
# ``cleaner.clean`` has processed it.
test_cleaner_data = [
    (test_data.json_best_homemade_salsa_recipe, 2),
    (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3),
    (test_data.json_bon_appetit, 8),
    (test_data.json_chunky_apple_cake, 4),
    (test_data.json_dairy_free_impossible_pumpkin_pie, 7),
    (test_data.json_how_to_make_instant_pot_spaghetti, 8),
    (test_data.json_instant_pot_chicken_and_potatoes, 4),
    (test_data.json_instant_pot_kerala_vegetable_stew, 13),
    (test_data.json_jalapeno_popper_dip, 4),
    (test_data.json_microwave_sweet_potatoes_04783, 4),
    (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4),
    (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3),
]
@pytest.mark.parametrize(
    "json_file,num_steps",
    test_cleaner_data,
)
def test_cleaner_clean(json_file: Path, num_steps):
    """Clean a JSON recipe fixture and check how many instruction steps remain.

    Args:
        json_file: Path to a JSON recipe fixture.
        num_steps: Expected length of ``recipeInstructions`` after ``cleaner.clean``.
    """
    # Decode explicitly as UTF-8 (the standard JSON interchange encoding). Without an
    # ``encoding`` argument ``read_text`` uses the platform locale encoding, so the
    # test result could depend on the machine it runs on.
    raw_recipe = json.loads(json_file.read_text(encoding="utf-8"))
    recipe_data = cleaner.clean(raw_recipe)
    assert len(recipe_data["recipeInstructions"]) == num_steps
def test_clean_category():
    """A bare category string is expected back wrapped in a one-element list."""
    categories = cleaner.category("my-category")
    assert categories == ["my-category"]
def test_clean_string():
    """HTML tags around the text are expected to be removed by ``clean_string``."""
    cleaned = cleaner.clean_string("<div>Hello World</div>")
    assert cleaned == "Hello World"
def test_clean_image():
    """Check ``cleaner.image`` against each supported input shape.

    Covers ``None`` (placeholder text), a plain URL string, a dict carrying a
    ``"url"`` key, and a list of URLs (first entry expected).
    """
    samples = (
        (None, "no image"),
        ("https://my.image/path/", "https://my.image/path/"),
        ({"url": "My URL!"}, "My URL!"),
        (["My URL!", "MY SECOND URL"], "My URL!"),
    )
    # Checked in order; the first mismatch fails the test.
    for raw_image, wanted in samples:
        assert cleaner.image(raw_image) == wanted
@pytest.mark.parametrize(
    "nutrition,expected",
    [
        (None, {}),
        ({"calories": "105 kcal"}, {"calories": "105"}),
        ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}),
        ({"calories": ""}, {}),
        (
            {
                "calories": ["not just a string"],
                "sugarContent": "but still tries 555.321",
            },
            {"sugarContent": "555.321"},
        ),
        ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}),
        ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}),
        ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}),
        ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}),
    ],
)
def test_clean_nutrition(nutrition, expected):
    """Check that ``cleaner.clean_nutrition`` maps ``nutrition`` to ``expected``."""
    cleaned = cleaner.clean_nutrition(nutrition)
    assert cleaned == expected
@pytest.mark.parametrize(
    "instructions",
    [
        # Newline-delimited strings, with and without blank lines, LF and CRLF.
        "A\n\nB\n\nC\n\n",
        "A\nB\nC\n",
        "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
        "A\r\nB\r\nC\r\n",
        # A plain list of step strings.
        ["A", "B", "C"],
        # A list of HowToStep dicts.
        [
            {"@type": "HowToStep", "text": "A"},
            {"@type": "HowToStep", "text": "B"},
            {"@type": "HowToStep", "text": "C"},
        ],
    ],
)
def test_cleaner_instructions(instructions):
    """Every input shape above is expected to clean to the same three text steps."""
    wanted_steps = [{"text": step} for step in ("A", "B", "C")]
    assert cleaner.instructions(instructions) == wanted_steps
def test_html_with_recipe_data():
    """Extract recipe fields from a saved HTML page via the OpenGraph strategy.

    Checks that name, slug and description are non-trivially long, that the
    original URL is preserved, and that the image field looks like a URL.
    """
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    html_file = test_data.html_healthy_pasta_bake_60759

    strategy = RecipeScraperOpenGraph(url)
    fields = strategy.get_recipe_fields(html_file.read_text())

    assert len(fields["name"]) > 10
    assert len(fields["slug"]) > 10
    assert fields["orgURL"] == url
    assert len(fields["description"]) > 100
    assert url_validation_regex.match(fields["image"])
@pytest.mark.parametrize(
    "time_delta,expected",
    [
        # Well-formed duration strings are expected to be pretty-printed.
        ("PT2H30M", "2 Hours 30 Minutes"),
        ("PT30M", "30 Minutes"),
        ("PT3H", "3 Hours"),
        ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"),
        ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"),
        # A negative component is expected back unchanged.
        ("PT-3H", "PT-3H"),
        # A duration with no components is expected back as "none".
        ("PT", "none"),
    ],
)
def test_time_cleaner(time_delta, expected):
    """Check that ``cleaner.clean_time`` maps ``time_delta`` to ``expected``."""
    cleaned = cleaner.clean_time(time_delta)
    assert cleaned == expected
@pytest.mark.parametrize(
    "t,max_components,max_decimal_places,expected",
    [
        # 17280 seconds is 4 hours 48 minutes.
        (timedelta(days=2, hours=4, minutes=48), None, 2, "2 days 4 Hours 48 Minutes"),
        (timedelta(days=2, hours=4, minutes=48), 1, 2, "2.2 days"),
        (timedelta(days=365), None, 2, "1 year"),
    ],
)
def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
    """Check the text ``cleaner.pretty_print_timedelta`` produces for each parameter set."""
    rendered = cleaner.pretty_print_timedelta(t, max_components, max_decimal_places)
    assert rendered == expected