diff --git a/.github/workflows/partial-backend.yml b/.github/workflows/partial-backend.yml index 436efb5581e6..a7db5aa7cb2d 100644 --- a/.github/workflows/partial-backend.yml +++ b/.github/workflows/partial-backend.yml @@ -54,7 +54,7 @@ jobs: id: cache-validate if: steps.cached-poetry-dependencies.outputs.cache-hit == 'true' run: | - echo "print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true + echo "import black;print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true rm test.py continue-on-error: true diff --git a/mealie/services/scraper/cleaner.py b/mealie/services/scraper/cleaner.py index 02b19a734c0c..9e44b84facfa 100644 --- a/mealie/services/scraper/cleaner.py +++ b/mealie/services/scraper/cleaner.py @@ -1,14 +1,30 @@ +import contextlib +import functools import html import json +import operator import re +import typing from datetime import datetime, timedelta -from typing import Optional from slugify import slugify -from mealie.core.root_logger import get_logger +MATCH_DIGITS = re.compile(r"\d+([.,]\d+)?") +""" Allow for commas as decimals (common in Europe) """ -logger = get_logger() +MATCH_ISO_STR = re.compile( + r"^P((\d+)Y)?((\d+)M)?((?P\d+)D)?" r"T((?P\d+)H)?((?P\d+)M)?((?P\d+(?:\.\d+)?)S)?$", +) +""" Match Duration Strings """ + +MATCH_HTML_TAGS = re.compile(r"<[^<]+?>") +""" Matches HTML tags `

Text

` -> `Text` """ + +MATCH_MULTI_SPACE = re.compile(r" +") +""" Matches multiple spaces `Hello World` -> `Hello World` """ + +MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n") +""" Matches multiple new lines and removes erroneous white space """ def clean(recipe_data: dict, url=None) -> dict: @@ -27,167 +43,167 @@ def clean(recipe_data: dict, url=None) -> dict: recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime")) recipe_data["performTime"] = clean_time(recipe_data.get("performTime")) recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime")) - recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", [])) - - recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield")) - recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient")) - recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions")) - recipe_data["image"] = image(recipe_data.get("image")) - recipe_data["slug"] = slugify(recipe_data.get("name")) # type: ignore + recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", [])) + recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield")) + recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", [])) + recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", [])) + recipe_data["image"] = clean_image(recipe_data.get("image")) + recipe_data["slug"] = slugify(recipe_data.get("name", "")) recipe_data["orgURL"] = url return recipe_data -def clean_string(text: str) -> str: - if isinstance(text, list): - text = text[0] +def clean_string(text: str | list | int) -> str: + """Cleans a string of HTML tags and extra white space""" + if not isinstance(text, str): + if isinstance(text, list): + text = text[0] - if isinstance(text, int): - text = str(text) + if isinstance(text, int): + text = str(text) - if text == "" or text is None: + if not text: return "" + text = 
typing.cast(str, text) # at this point we know text is a string + cleaned_text = html.unescape(text) - cleaned_text = re.sub("<[^<]+?>", "", cleaned_text) - cleaned_text = re.sub(" +", " ", cleaned_text) - cleaned_text = re.sub("

", "\n", cleaned_text) - cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text) - cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip() + cleaned_text = MATCH_HTML_TAGS.sub("", cleaned_text) + cleaned_text = MATCH_MULTI_SPACE.sub(" ", cleaned_text) + cleaned_text = MATCH_ERRONEOUS_WHITE_SPACE.sub("\n\n", cleaned_text) + + cleaned_text = cleaned_text.replace("

", "\n").replace("\xa0", " ").replace("\t", " ").strip() return cleaned_text -def category(category: str): - if isinstance(category, list) and len(category) > 0 and isinstance(category[0], dict): - # If the category is a list of dicts, it's probably from a migration - # validate that the required fields are present - valid = [] - for cat in category: - if "name" in cat and "slug" in cat: - valid.append(cat) +def clean_image(image: str | list | dict | None = None, default="no image") -> str: + """ + image attempts to parse the image field from a recipe and return a string. Currenty - return valid + Supported Structures: + - `["https://exmaple.com"]` - A list of strings + - `https://exmaple.com` - A string + - `{ "url": "https://exmaple.com"` - A dictionary with a `url` key - if isinstance(category, str) and category != "": - return [category] + Raises: + TypeError: If the image field is not a supported type a TypeError is raised. - return [] - - -def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]: - # Assumes that all units are supplied in grams, except sodium which may be in mg. - - # Fn only expects a dict[str,str]. Other structures should not be parsed. - if not isinstance(nutrition, dict): - return {} - - # Allow for commas as decimals (common in Europe) - # Compile once for efficiency - re_match_digits = re.compile(r"\d+([.,]\d+)?") - - output_nutrition = {} - for key, val in nutrition.items(): - # If the val contains digits matching the regex, add the first match to the output dict. - # Handle unexpected datastructures safely. 
- try: - if matched_digits := re_match_digits.search(val): - output_nutrition[key] = matched_digits.group(0) - except Exception: - continue - - output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()} - - if ( - "sodiumContent" in nutrition - and type(nutrition["sodiumContent"]) == str - and "m" not in nutrition["sodiumContent"] - and "g" in nutrition["sodiumContent"] - ): - # Sodium is in grams. Parse its value, multiple by 1k and return to string. - try: - output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000) - except ValueError: - # Could not parse sodium content as float, so don't touch it. - pass - - return output_nutrition - - -def image(image=None) -> str: + Returns: + str: "no image" if any empty string is provided or the url of the image + """ if not image: - return "no image" - if isinstance(image, list): - return image[0] - elif isinstance(image, dict): - return image["url"] - elif isinstance(image, str): - return image - else: - raise Exception(f"Unrecognised image URL format: {image}") + return default + + match image: + case str(image): + return image + case list(image): + return image[0] + case {"url": str(image)}: + return image + case _: + raise TypeError(f"Unexpected type for image: {type(image)}, {image}") -def instructions(instructions) -> list[dict]: - try: - instructions = json.loads(instructions) - except Exception: - pass +def clean_instructions(steps_object: list | dict | str, default: list | None = None) -> list[dict]: + """ + instructions attempts to parse the instructions field from a recipe and return a list of + dictionaries. See match statement for supported types and structures - if not instructions: - return [] + Raises: + TypeError: If the instructions field is not a supported type a TypeError is raised. 
- # Dictionary (Keys: step number strings, Values: the instructions) - if isinstance(instructions, dict): - instructions = list(instructions.values()) + Returns: + list[dict]: An ordered list of dictionaries with the keys `text` + """ + if not steps_object: + return default or [] - if isinstance(instructions, list) and isinstance(instructions[0], list): - instructions = instructions[0] - - # One long string split by (possibly multiple) new lines - if isinstance(instructions, str): - return [{"text": _instruction(line)} for line in instructions.splitlines() if line] - - # Plain strings in a list - elif isinstance(instructions, list) and isinstance(instructions[0], str): - return [{"text": _instruction(step)} for step in instructions] - - # Dictionaries (let's assume it's a HowToStep) in a list - elif isinstance(instructions, list) and isinstance(instructions[0], dict): - # Try List of Dictionary without "@type" or "type" - if not instructions[0].get("@type", False) and not instructions[0].get("type", False): - return [{"text": _instruction(step["text"])} for step in instructions] - - try: - # If HowToStep is under HowToSection - sectionSteps = [] - for step in instructions: - if step["@type"] == "HowToSection": - for sectionStep in step["itemListElement"]: - sectionSteps.append(sectionStep) - - if len(sectionSteps) > 0: - return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"] - - return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"] - except Exception as e: - logger.error(e) - # Not "@type", try "type" - try: - return [ - {"text": _instruction(step["properties"]["text"])} - for step in instructions - if step["type"].find("HowToStep") > -1 - ] - except Exception: - pass - - else: - raise Exception(f"Unrecognised instruction format: {instructions}") - - return [] + match steps_object: + case [{"text": str()}]: # Base Case + return steps_object + case [{"text": str()}, *_]: 
+ # The is the most common case. Most other operations eventually resolve to this + # match case before being converted to a list of instructions + # + # [ + # {"text": "Instruction A"}, + # {"text": "Instruction B"}, + # ] + # + return [ + {"text": _sanitize_instruction_text(instruction["text"])} + for instruction in steps_object + if instruction["text"].strip() + ] + case {0: {"text": str()}} | {"0": {"text": str()}} | {1: {"text": str()}} | {"1": {"text": str()}}: + # Some recipes have a dict with a string key representing the index, unsure if these can + # be an int or not so we match against both. Additionally, we match against both 0 and 1 indexed + # list like dicts. + # + # { + # "0": {"text": "Instruction A"}, + # "1": {"text": "Instruction B"}, + # } + # + steps_object = typing.cast(dict, steps_object) + return clean_instructions([x for x in steps_object.values()]) + case str(step_as_str): + # Strings are weird, some sites return a single string with newlines + # others returns a json string for some reasons + # + # "Instruction A\nInstruction B\nInstruction C" + # '{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}' + # + if step_as_str.startswith("[") or step_as_str.startswith("{"): + try: + return clean_instructions(json.loads(step_as_str)) + except json.JSONDecodeError: + pass + return [ + {"text": _sanitize_instruction_text(instruction)} + for instruction in step_as_str.splitlines() + if instruction.strip() + ] + case [str(), *_]: + # Assume list of strings is a valid list of instructions + # + # [ + # "Instruction A", + # "Instruction B", + # ] + # + return [ + {"text": _sanitize_instruction_text(instruction)} for instruction in steps_object if instruction.strip() + ] + case [{"@type": "HowToSection"}, *_] | [{"type": "HowToSection"}, *_]: + # HowToSections should have the following layout, + # { + # "@type": "HowToSection", + # "itemListElement": [ + # { + # "@type": "HowToStep", + # "text": 
"Instruction A" + # }, + # } + # + steps_object = typing.cast(list[dict[str, str]], steps_object) + return clean_instructions(functools.reduce(operator.concat, [x["itemListElement"] for x in steps_object], [])) # type: ignore + case _: + raise TypeError(f"Unexpected type for instructions: {type(steps_object)}, {steps_object}") -def _instruction(line) -> str: +def _sanitize_instruction_text(line: str | dict) -> str: + """ + _sanitize_instructions_text does some basic checking if the value is a string or dictionary + and returns the value of the `text` key if it is a dictionary. The returned string is passed through the + `clean_string` function to remove any html tags and extra whitespace in a loop until the string + is stable. + + Calling `clean_string` in a loop is necessary because some sites return a string with erroneously escaped + html tags or markup. + """ if isinstance(line, dict): # Some Recipes dotnot adhear to schema try: @@ -195,58 +211,111 @@ def _instruction(line) -> str: except Exception: line = "" + if not line: + return "" + + line = typing.cast(str, line) clean_line = clean_string(line.strip()) - # Some sites erroneously escape their strings on multiple levels + while not clean_line == (clean_line := clean_string(clean_line)): pass + return clean_line -def ingredient(ingredients: list | None) -> list[str]: - if ingredients: - return [clean_string(ing) for ing in ingredients] - else: - return [] +def clean_ingredients(ingredients: list | str | None, default: list = None) -> list[str]: + """ + ingredient attempts to parse the ingredients field from a recipe and return a list of + + Supported Structures: + - `["1 cup flour"]` - A list of strings + - `"1 cup flour"` - A string + - `None` - returns an empty list + + Raises: + TypeError: If the ingredients field is not a supported type a TypeError is raised. 
+ """ + match ingredients: + case None: + return default or [] + case list(ingredients): + return [clean_string(ingredient) for ingredient in ingredients] + case str(ingredients): + return [clean_string(ingredient) for ingredient in ingredients.splitlines()] + case _: + raise TypeError(f"Unexpected type for ingredients: {type(ingredients)}, {ingredients}") -def yield_amount(yld) -> str: +def clean_yield(yld: str | list[str] | None) -> str: + """ + yield_amount attemps to parse out the yield amount from a recipe. + + Supported Structures: + - `"4 servings"` - returns the string unmodified + - `["4 servings", "4 Pies"]` - returns the last value + + Returns: + str: The yield amount, if it can be parsed else an empty string + """ + if not yld: + return "" + if isinstance(yld, list): return yld[-1] - else: - return yld + + return yld -def clean_time(time_entry): - if time_entry is None or time_entry == "" or time_entry == " ": +def clean_time(time_entry: str | timedelta | None) -> None | str: + """_summary_ + + Supported Structures: + - `None` - returns None + - `"PT1H"` - returns "1 hour" + - `"PT1H30M"` - returns "1 hour 30 minutes" + - `timedelta(hours=1, minutes=30)` - returns "1 hour 30 minutes" + + Raises: + TypeError: if the type is not supported a TypeError is raised + + Returns: + None | str: None if the time_entry is None, otherwise a string representing the time + """ + if not time_entry: return None - elif isinstance(time_entry, timedelta): - return pretty_print_timedelta(time_entry) - elif isinstance(time_entry, datetime): - pass - # print(time_entry) - elif isinstance(time_entry, str): - try: - time_delta_object = parse_duration(time_entry) - return pretty_print_timedelta(time_delta_object) - except ValueError: - logger.error(f"Could not parse time_entry `{time_entry}`") + + match time_entry: + case str(time_entry): + if not time_entry.strip(): + return None + + try: + time_delta_instructionsect = parse_duration(time_entry) + return 
pretty_print_timedelta(time_delta_instructionsect) + except ValueError: + return str(time_entry) + case timedelta(): + return pretty_print_timedelta(time_entry) + case datetime(): + # TODO: Not sure what to do here return str(time_entry) - else: - return str(time_entry) + case _: + raise TypeError(f"Unexpected type for time: {type(time_entry)}, {time_entry}") -def parse_duration(iso_duration): - """Parses an ISO 8601 duration string into a datetime.timedelta instance. +def parse_duration(iso_duration: str) -> timedelta: + """ + Parses an ISO 8601 duration string into a datetime.timedelta instance. + Args: iso_duration: an ISO 8601 duration string. - Returns: - a datetime.timedelta instance + + Raises: + ValueError: if the input string is not a valid ISO 8601 duration string. """ - m = re.match( - r"^P((\d+)Y)?((\d+)M)?((?P\d+)D)?" - r"T((?P\d+)H)?((?P\d+)M)?((?P\d+(?:\.\d+)?)S)?$", - iso_duration, - ) + + m = MATCH_ISO_STR.match(iso_duration) + if m is None: raise ValueError("invalid ISO 8601 duration string") @@ -257,7 +326,7 @@ def parse_duration(iso_duration): # convert parsed years and months to specific number of days. times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} - for unit, _ in times.items(): + for unit in times.keys(): if m.group(unit): times[unit] = int(float(m.group(unit))) @@ -299,30 +368,73 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places return " ".join(out_list) +def clean_categories(category: str | list) -> list[str]: + if not category: + return [] + + match category: + case str(category): + if not category.strip(): + return [] + + return [category] + case [str(), *_]: + return [cat.strip().title() for cat in category if cat.strip()] + case [{"name": str(), "slug": str()}, *_]: + # Special case for when we use the cleaner to cleanup a migration. 
+ # + # [ + # { "name": "Dessert", "slug": "dessert"} + # ] + # + return [cat["name"] for cat in category if "name" in cat] + case _: + raise TypeError(f"Unexpected type for category: {type(category)}, {category}") + + def clean_tags(data: str | list[str]) -> list[str]: """ - Gets keywords as a list or natural language list and returns them into a list of strings of individual tags + Gets keywords as a list or natural language list and returns + them into a list of strings of individual tags """ - if data is None: + if not data: return [] - if isinstance(data, list): - all_str = True - i = 0 - while all_str and i < len(data): - all_str = isinstance(data[i], str) - i = i + 1 + match data: + case [str(), *_]: + return [tag.strip().title() for tag in data if tag.strip()] + case str(data): + return clean_tags([t for t in data.split(",")]) + case _: + return [] + # should probably raise exception + # raise TypeError(f"Unexpected type for tags: {type(data)}, {data}") - if all_str: - return data - return [] - if isinstance(data, str): - tag_list = data.split(",") +def clean_nutrition(nutrition: dict | None) -> dict[str, str]: + """ + clean_nutrition takes a dictionary of nutrition information and cleans it up + to be stored in the database. 
It will remove any keys that are not in the + list of valid keys - for i in range(len(tag_list)): - tag_list[i] = tag_list[i].strip().capitalize() + Assumptionas: + - All units are supplied in grams, expect sodium which maybe be in milligrams - return tag_list + Returns: + dict[str, str]: If the argument is None, or not a dictionary, an empty dictionary is returned + """ + if not isinstance(nutrition, dict): + return {} - return [] + output_nutrition = {} + for key, val in nutrition.items(): + with contextlib.suppress(AttributeError, TypeError): + if matched_digits := MATCH_DIGITS.search(val): + output_nutrition[key] = matched_digits.group(0).replace(",", ".") + + if sodium := nutrition.get("sodiumContent", None): + if isinstance(sodium, str) and "m" not in sodium and "g" in sodium: + with contextlib.suppress(AttributeError, TypeError): + output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000) + + return output_nutrition diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py index aba3f3615027..82bf35f8d819 100644 --- a/mealie/services/scraper/scraper_strategies.py +++ b/mealie/services/scraper/scraper_strategies.py @@ -123,7 +123,7 @@ class RecipeScraperPackage(ABCScraperStrategy): self.logger.debug(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}") - instruction_as_text = cleaner.instructions(instruction_as_text) + instruction_as_text = cleaner.clean_instructions(instruction_as_text) self.logger.debug(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}") @@ -147,7 +147,9 @@ class RecipeScraperPackage(ABCScraperStrategy): description=try_get_default(None, "description", "", cleaner.clean_string), nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition), recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string), - 
recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient), + recipe_ingredient=try_get_default( + scraped_data.ingredients, "recipeIngredient", [""], cleaner.clean_ingredients + ), recipe_instructions=get_instructions(), total_time=try_get_default(None, "totalTime", None, cleaner.clean_time), prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time), diff --git a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py index 96637eedeaa4..a671753cc6fb 100644 --- a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py +++ b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py @@ -128,20 +128,20 @@ def test_create_by_url_with_tags( response = api_client.get(api_routes.recipes_slug(slug), headers=unique_user.token) assert response.status_code == 200 - # Verifiy the tags are present + # Verifiy the tags are present and title cased expected_tags = { - "sauté", - "pea", - "noodle", - "udon noodle", - "ramen noodle", - "dinner", - "main", - "vegetarian", - "easy", - "quick", - "weeknight meals", - "web", + "Sauté", + "Pea", + "Noodle", + "Udon Noodle", + "Ramen Noodle", + "Dinner", + "Main", + "Vegetarian", + "Easy", + "Quick", + "Weeknight Meals", + "Web", } recipe = json.loads(response.text) diff --git a/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py b/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py new file mode 100644 index 000000000000..395b943016d2 --- /dev/null +++ b/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py @@ -0,0 +1,56 @@ +import json +import re +from pathlib import Path + +import pytest + +from mealie.services.scraper import cleaner +from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph +from tests import data as test_data + +# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45 +url_validation_regex = re.compile( + 
r"^(?:http|ftp)s?://" # http:// or https:// + r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... + r"localhost|" # localhost... + r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip + r"(?::\d+)?" # optional port + r"(?:/?|[/?]\S+)$", + re.IGNORECASE, +) + +test_cleaner_data = [ + (test_data.json_best_homemade_salsa_recipe, 2), + (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3), + (test_data.json_bon_appetit, 8), + (test_data.json_chunky_apple_cake, 4), + (test_data.json_dairy_free_impossible_pumpkin_pie, 7), + (test_data.json_how_to_make_instant_pot_spaghetti, 8), + (test_data.json_instant_pot_chicken_and_potatoes, 4), + (test_data.json_instant_pot_kerala_vegetable_stew, 13), + (test_data.json_jalapeno_popper_dip, 4), + (test_data.json_microwave_sweet_potatoes_04783, 4), + (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4), + (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3), +] + + +@pytest.mark.parametrize("json_file,num_steps", test_cleaner_data) +def test_cleaner_clean(json_file: Path, num_steps): + recipe_data = cleaner.clean(json.loads(json_file.read_text())) + assert len(recipe_data["recipeInstructions"]) == num_steps + + +def test_html_with_recipe_data(): + path = test_data.html_healthy_pasta_bake_60759 + url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759" + + open_graph_strategy = RecipeScraperOpenGraph(url) + + recipe_data = open_graph_strategy.get_recipe_fields(path.read_text()) + + assert len(recipe_data["name"]) > 10 + assert len(recipe_data["slug"]) > 10 + assert recipe_data["orgURL"] == url + assert len(recipe_data["description"]) > 100 + assert url_validation_regex.match(recipe_data["image"]) diff --git a/tests/unit_tests/services_tests/scraper_tests/test_cleaner_parts.py b/tests/unit_tests/services_tests/scraper_tests/test_cleaner_parts.py new file mode 100644 index 000000000000..b5493da4b726 --- /dev/null +++ 
b/tests/unit_tests/services_tests/scraper_tests/test_cleaner_parts.py @@ -0,0 +1,541 @@ +from dataclasses import dataclass +from datetime import timedelta +from typing import Any + +import pytest + +from mealie.services.scraper import cleaner + + +@dataclass(slots=True) +class CleanerCase: + test_id: str + input: Any + expected: Any + exception: Any = None + + +clean_string_test_cases = ( + CleanerCase( + test_id="empty_string", + input="", + expected="", + ), + CleanerCase( + test_id="html", + input="

Hello World

", + expected="Hello World", + ), + CleanerCase( + test_id="no_change", + input="Hello World", + expected="Hello World", + ), + CleanerCase( + test_id="html_with_extra_closing_tag", + input="

Hello World

", + expected="Hello World", + ), + CleanerCase( + test_id="multiple_spaces", + input="Hello World", + expected="Hello World", + ), + CleanerCase( + test_id="tabs", + input="\tHello World\t", + expected="Hello World", + ), + CleanerCase( + test_id="nbsp", + input="\xa0Hello World\xa0", + expected="Hello World", + ), + CleanerCase( + test_id="list", + input=["Hello World", "Goodbye World"], + expected="Hello World", + ), + CleanerCase( + test_id="int", + input=1, + expected="1", + ), +) + + +@pytest.mark.parametrize("case", clean_string_test_cases, ids=(x.test_id for x in clean_string_test_cases)) +def test_cleaner_clean_string(case: CleanerCase) -> None: + assert case.expected == cleaner.clean_string(case.input) + + +image_cleaner_test_cases = ( + CleanerCase( + test_id="empty_string", + input="", + expected="no image", + ), + CleanerCase( + test_id="no_change", + input="https://example.com/image.jpg", + expected="https://example.com/image.jpg", + ), + CleanerCase( + test_id="dict with url key", + input={"url": "https://example.com/image.jpg"}, + expected="https://example.com/image.jpg", + ), + CleanerCase( + test_id="list of strings", + input=["https://example.com/image.jpg"], + expected="https://example.com/image.jpg", + ), +) + + +@pytest.mark.parametrize("case", image_cleaner_test_cases, ids=(x.test_id for x in image_cleaner_test_cases)) +def test_cleaner_image_cleaner(case: CleanerCase): + result = cleaner.clean_image(case.input) + assert case.expected == result + + +instruction_test_cases = ( + CleanerCase( + test_id="single string", + input="Instruction A\nInstruction B\nInstruction C", + expected=None, + ), + CleanerCase( + test_id="single string multiple newlines", + input="Instruction A\n\nInstruction B\n\nInstruction C", + expected=None, + ), + CleanerCase( + test_id="common list of dicts", + input=[ + {"text": "Instruction A"}, + {"text": "Instruction B"}, + {"text": "Instruction C"}, + ], + expected=None, + ), + CleanerCase( + test_id="dict with int 
keys", + input={ + 0: {"text": "Instruction A"}, + 1: {"text": "Instruction B"}, + 2: {"text": "Instruction C"}, + }, + expected=None, + ), + CleanerCase( + test_id="dict with str num keys", + input={ + "0": {"text": "Instruction A"}, + "1": {"text": "Instruction B"}, + "2": {"text": "Instruction C"}, + }, + expected=None, + ), + CleanerCase( + test_id="dict with str num keys", + input={ + "1": {"text": "Instruction A"}, + "2": {"text": "Instruction B"}, + "3": {"text": "Instruction C"}, + }, + expected=None, + ), + CleanerCase( + test_id="dict with str num keys", + input={ + 1: {"text": "Instruction A"}, + 2: {"text": "Instruction B"}, + 3: {"text": "Instruction C"}, + }, + expected=None, + ), + CleanerCase( + test_id="raw json str", + input='{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}', + expected=None, + ), + CleanerCase( + test_id="how to steps", + input=[ + { + "@type": "HowToSection", + "itemListElement": [ + { + "@type": "HowToStep", + "text": "Instruction A", + }, + { + "@type": "HowToStep", + "text": "Instruction B", + }, + ], + }, + { + "@type": "HowToSection", + "itemListElement": [ + { + "@type": "HowToStep", + "text": "Instruction C", + }, + ], + }, + ], + expected=None, + ), + CleanerCase( + test_id="excessive whitespace str (1)", + input="Instruction A\n\nInstruction B\n\nInstruction C\n\n", + expected=None, + ), + CleanerCase( + test_id="excessive whitespace str (2)", + input="Instruction A\nInstruction B\nInstruction C\n", + expected=None, + ), + CleanerCase( + test_id="excessive whitespace str (3)", + input="Instruction A\r\n\r\nInstruction B\r\n\r\nInstruction C\r\n\r\n", + expected=None, + ), + CleanerCase( + test_id="excessive whitespace str (4)", + input="Instruction A\r\nInstruction B\r\nInstruction C\r\n", + expected=None, + ), +) + + +@pytest.mark.parametrize("instructions", instruction_test_cases, ids=(x.test_id for x in instruction_test_cases)) +def 
test_cleaner_instructions(instructions: CleanerCase): + reuslt = cleaner.clean_instructions(instructions.input) + + expected = [ + {"text": "Instruction A"}, + {"text": "Instruction B"}, + {"text": "Instruction C"}, + ] + + assert reuslt == expected + + +ingredients_test_cases = ( + CleanerCase( + input="", + expected=[], + test_id="empty string", + ), + CleanerCase( + input="1 cup of flour", + expected=["1 cup of flour"], + test_id="single ingredient string", + ), + CleanerCase( + input=["1 cup of flour"], + expected=["1 cup of flour"], + test_id="single ingredient list", + ), + CleanerCase( + input=["1 cup of flour", "1 cup of sugar"], + expected=["1 cup of flour", "1 cup of sugar"], + test_id="multiple ingredient list", + ), + CleanerCase( + input={"0": "1 cup of flour", "1": "1 cup of sugar"}, + expected=None, + test_id="multiple ingredient dictionary", + exception=TypeError, + ), +) + + +@pytest.mark.parametrize("ingredients", ingredients_test_cases, ids=(x.test_id for x in ingredients_test_cases)) +def test_cleaner_clean_ingredients(ingredients: CleanerCase): + + if ingredients.exception: + with pytest.raises(ingredients.exception): + cleaner.clean_ingredients(ingredients.input) + + return + + assert ingredients.expected == cleaner.clean_ingredients(ingredients.input) + + +yield_test_cases = ( + CleanerCase( + test_id="empty string", + input="", + expected="", + ), + CleanerCase( + test_id="list of strings", + input=["Makes 4 Batches", "4 Batches"], + expected="4 Batches", + ), + CleanerCase( + test_id="basic string", + input="Makes 4 Batches", + expected="Makes 4 Batches", + ), + CleanerCase( + test_id="empty list", + input=[], + expected="", + ), +) + + +@pytest.mark.parametrize("case", yield_test_cases, ids=(x.test_id for x in yield_test_cases)) +def test_cleaner_clean_yield_amount(case: CleanerCase): + result = cleaner.clean_yield(case.input) + assert case.expected == result + + +time_test_cases = ( + CleanerCase( + test_id="empty string", + input="", + 
expected=None, + ), + CleanerCase( + test_id="emtpy whitespace", + input=" ", + expected=None, + ), + CleanerCase( + test_id="none", + input=None, + expected=None, + ), + CleanerCase( + test_id="invalid string", + input="invalid", + expected="invalid", + ), + CleanerCase( + test_id="timedelta", + input=timedelta(minutes=30), + expected="30 Minutes", + ), + CleanerCase( + test_id="timedelta string (1)", + input="PT2H30M", + expected="2 Hours 30 Minutes", + ), + CleanerCase( + test_id="timedelta string (2)", + input="PT30M", + expected="30 Minutes", + ), + CleanerCase( + test_id="timedelta string (3)", + input="PT2H", + expected="2 Hours", + ), + CleanerCase( + test_id="timedelta string (4)", + input="P1DT1H1M1S", + expected="1 day 1 Hour 1 Minute 1 Second", + ), + CleanerCase( + test_id="timedelta string (4)", + input="P1DT1H1M1.53S", + expected="1 day 1 Hour 1 Minute 1 Second", + ), + CleanerCase( + test_id="timedelta string (5) invalid", + input="PT", + expected="none", + ), + CleanerCase( + test_id="timedelta string (6) PT-3H", + input="PT-3H", + expected="PT-3H", + ), +) + + +@pytest.mark.parametrize("case", time_test_cases, ids=(x.test_id for x in time_test_cases)) +def test_cleaner_clean_time(case: CleanerCase): + result = cleaner.clean_time(case.input) + assert case.expected == result + + +category_test_cases = ( + CleanerCase( + test_id="empty string", + input="", + expected=[], + ), + CleanerCase( + test_id="emtpy whitespace", + input=" ", + expected=[], + ), + CleanerCase( + test_id="emtpy list", + input=[], + expected=[], + ), + CleanerCase( + test_id="single string", + input="Dessert", + expected=["Dessert"], + ), + CleanerCase( + test_id="nested dictionary", + input=[ + {"name": "Dessert", "slug": "dessert"}, + {"name": "Lunch", "slug": "lunch"}, + ], + expected=["Dessert", "Lunch"], + ), +) + + +@pytest.mark.parametrize("case", category_test_cases, ids=(x.test_id for x in category_test_cases)) +def test_cleaner_clean_categories(case: CleanerCase): + 
result = cleaner.clean_categories(case.input)
+    assert case.expected == result
+
+
+tag_test_cases = (
+    CleanerCase(
+        test_id="empty string",
+        input="",
+        expected=[],
+    ),
+    CleanerCase(
+        test_id="single tag",
+        input="tag",
+        expected=["Tag"],
+    ),
+    CleanerCase(
+        test_id="comma separated tags",
+        input="tag1, tag2, tag3",
+        expected=["Tag1", "Tag2", "Tag3"],
+    ),
+    CleanerCase(
+        test_id="list of tags",
+        input=["tag1", "tag2", "tag3"],
+        expected=["Tag1", "Tag2", "Tag3"],
+    ),
+)
+
+
+@pytest.mark.parametrize("case", tag_test_cases, ids=(x.test_id for x in tag_test_cases))
+def test_cleaner_clean_tags(case: CleanerCase):
+    result = cleaner.clean_tags(case.input)
+    assert case.expected == result
+
+
+nutrition_test_cases = (
+    CleanerCase(
+        test_id="empty dict",
+        input={},
+        expected={},
+    ),
+    CleanerCase(
+        test_id="valid keys",
+        input={
+            "calories": "100mg",
+            "fatContent": "10",
+        },
+        expected={
+            "calories": "100",
+            "fatContent": "10",
+        },
+    ),
+    CleanerCase(
+        test_id="invalid keys get removed",
+        input={
+            "calories": "100mg",
+            "fatContent": "10",
+            "invalid": "invalid",
+        },
+        expected={
+            "calories": "100",
+            "fatContent": "10",
+        },
+    ),
+    CleanerCase(
+        test_id="support `,` separated numbers instead of `.` (common in Europe)",
+        input={
+            "calories": "100,000mg",
+            "fatContent": "10,000",
+        },
+        expected={
+            "calories": "100.000",
+            "fatContent": "10.000",
+        },
+    ),
+    CleanerCase(
+        test_id="special support for sodiumContent (g -> mg)",
+        input={
+            "sodiumContent": "10g",
+        },
+        expected={
+            "sodiumContent": "10000.0",
+        },
+    ),
+    CleanerCase(
+        test_id="special support for sodiumContent (mg -> mg)",
+        input={
+            "sodiumContent": "10000mg",
+        },
+        expected={
+            "sodiumContent": "10000",
+        },
+    ),
+    CleanerCase(
+        test_id="strip units",
+        input={
+            "calories": "100 kcal",
+        },
+        expected={
+            "calories": "100",
+        },
+    ),
+    CleanerCase(
+        test_id="list as value continues after first value",
+        input={
+            "calories": ["100 kcal"],
+            
"sugarContent": "but still tries 555.321", + }, + expected={ + "sugarContent": "555.321", + }, + ), + CleanerCase( + test_id="multiple decimals", + input={ + "sodiumContent": "10.1.2g", + }, + expected={ + "sodiumContent": "10100.0", + }, + ), +) + + +@pytest.mark.parametrize("case", nutrition_test_cases, ids=(x.test_id for x in nutrition_test_cases)) +def test_cleaner_clean_nutrition(case: CleanerCase): + result = cleaner.clean_nutrition(case.input) + assert case.expected == result + + +@pytest.mark.parametrize( + "t,max_components,max_decimal_places,expected", + [ + (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"), + (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"), + (timedelta(days=365), None, 2, "1 year"), + ], +) +def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected): + assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected diff --git a/tests/unit_tests/test_cleaner.py b/tests/unit_tests/test_cleaner.py deleted file mode 100644 index 733aa6f3e14d..000000000000 --- a/tests/unit_tests/test_cleaner.py +++ /dev/null @@ -1,140 +0,0 @@ -import json -import re -from datetime import timedelta -from pathlib import Path - -import pytest - -from mealie.services.scraper import cleaner -from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph -from tests import data as test_data - -# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45 -url_validation_regex = re.compile( - r"^(?:http|ftp)s?://" # http:// or https:// - r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... - r"localhost|" # localhost... - r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip - r"(?::\d+)?" 
# optional port - r"(?:/?|[/?]\S+)$", - re.IGNORECASE, -) - -test_cleaner_data = [ - (test_data.json_best_homemade_salsa_recipe, 2), - (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3), - (test_data.json_bon_appetit, 8), - (test_data.json_chunky_apple_cake, 4), - (test_data.json_dairy_free_impossible_pumpkin_pie, 7), - (test_data.json_how_to_make_instant_pot_spaghetti, 8), - (test_data.json_instant_pot_chicken_and_potatoes, 4), - (test_data.json_instant_pot_kerala_vegetable_stew, 13), - (test_data.json_jalapeno_popper_dip, 4), - (test_data.json_microwave_sweet_potatoes_04783, 4), - (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4), - (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3), -] - - -@pytest.mark.parametrize( - "json_file,num_steps", - test_cleaner_data, -) -def test_cleaner_clean(json_file: Path, num_steps): - recipe_data = cleaner.clean(json.loads(json_file.read_text())) - assert len(recipe_data["recipeInstructions"]) == num_steps - - -def test_clean_category(): - assert cleaner.category("my-category") == ["my-category"] - - -def test_clean_string(): - assert cleaner.clean_string("
Hello World
") == "Hello World" - - -def test_clean_image(): - assert cleaner.image(None) == "no image" - assert cleaner.image("https://my.image/path/") == "https://my.image/path/" - assert cleaner.image({"url": "My URL!"}) == "My URL!" - assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!" - - -@pytest.mark.parametrize( - "nutrition,expected", - [ - (None, {}), - ({"calories": "105 kcal"}, {"calories": "105"}), - ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}), - ({"calories": ""}, {}), - ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}), - ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}), - ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}), - ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}), - ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}), - ], -) -def test_clean_nutrition(nutrition, expected): - assert cleaner.clean_nutrition(nutrition) == expected - - -@pytest.mark.parametrize( - "instructions", - [ - "A\n\nB\n\nC\n\n", - "A\nB\nC\n", - "A\r\n\r\nB\r\n\r\nC\r\n\r\n", - "A\r\nB\r\nC\r\n", - ["A", "B", "C"], - [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]], - ], -) -def test_cleaner_instructions(instructions): - assert cleaner.instructions(instructions) == [ - {"text": "A"}, - {"text": "B"}, - {"text": "C"}, - ] - - -def test_html_with_recipe_data(): - path = test_data.html_healthy_pasta_bake_60759 - url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759" - - open_graph_strategy = RecipeScraperOpenGraph(url) - - recipe_data = open_graph_strategy.get_recipe_fields(path.read_text()) - - assert len(recipe_data["name"]) > 10 - assert len(recipe_data["slug"]) > 10 - assert recipe_data["orgURL"] == url - assert len(recipe_data["description"]) > 100 - assert url_validation_regex.match(recipe_data["image"]) - - -@pytest.mark.parametrize( - "time_delta,expected", - [ - ("PT2H30M", "2 Hours 30 Minutes"), - ("PT30M", "30 Minutes"), - ("PT3H", "3 
Hours"), - ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"), - ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"), - ("PT-3H", "PT-3H"), - ("PT", "none"), - ], -) -def test_time_cleaner(time_delta, expected): - assert cleaner.clean_time(time_delta) == expected - - -@pytest.mark.parametrize( - "t,max_components,max_decimal_places,expected", - [ - (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"), - (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"), - (timedelta(days=365), None, 2, "1 year"), - ], -) -def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected): - assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected