Mirror of https://github.com/mealie-recipes/mealie.git, synced 2025-07-09 03:04:54 -04:00
refactor: rewrite cleaner functions for parsing recipe dicts (#1743)
* rewrite cleaner functions
* unify verbiage
* try importing dep during check
* fix syntax
* allow override defaults
* satisfy mypy
This commit is contained in:
parent 77316d639b
commit 89d0cae51d
.github/workflows/partial-backend.yml (vendored): 2 changes
@@ -54,7 +54,7 @@ jobs:
        id: cache-validate
        if: steps.cached-poetry-dependencies.outputs.cache-hit == 'true'
        run: |
          echo "print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true
          echo "import black;print('venv good?')" > test.py && poetry run python test.py && echo ::set-output name=cache-hit-success::true
          rm test.py
        continue-on-error: true
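The workflow change above swaps the trivial venv check for one that imports a dev dependency, so a stale cached virtualenv fails fast instead of passing a meaningless print test. A minimal Python sketch of the same idea, assumed rather than taken from the workflow:

# Assumed illustration of the cache-validation idea: importing a known dev
# dependency (black here) fails immediately if the cached venv is incomplete.
import importlib

try:
    importlib.import_module("black")
    print("venv good?")
except ImportError as exc:
    raise SystemExit(f"cached virtualenv is stale: {exc}")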
@@ -1,14 +1,30 @@
import contextlib
import functools
import html
import json
import operator
import re
import typing
from datetime import datetime, timedelta
from typing import Optional

from slugify import slugify

from mealie.core.root_logger import get_logger

MATCH_DIGITS = re.compile(r"\d+([.,]\d+)?")
""" Allow for commas as decimals (common in Europe) """

logger = get_logger()
MATCH_ISO_STR = re.compile(
    r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?" r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$",
)
""" Match Duration Strings """

MATCH_HTML_TAGS = re.compile(r"<[^<]+?>")
""" Matches HTML tags `<p>Text</p>` -> `Text` """

MATCH_MULTI_SPACE = re.compile(r" +")
""" Matches multiple spaces `Hello  World` -> `Hello World` """

MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
""" Matches multiple new lines and removes erroneous white space """


def clean(recipe_data: dict, url=None) -> dict:
@@ -27,167 +43,167 @@ def clean(recipe_data: dict, url=None) -> dict:
    recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
    recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
    recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
    recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))

    recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
    recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
    recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
    recipe_data["image"] = image(recipe_data.get("image"))
    recipe_data["slug"] = slugify(recipe_data.get("name"))  # type: ignore
    recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", []))
    recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
    recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
    recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
    recipe_data["image"] = clean_image(recipe_data.get("image"))
    recipe_data["slug"] = slugify(recipe_data.get("name", ""))
    recipe_data["orgURL"] = url

    return recipe_data
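To make the renamed entry point concrete, a minimal usage sketch (not part of the commit; the recipe dict and URL are invented):

from mealie.services.scraper import cleaner

# Invented scraped data, shaped like the JSON-LD dicts that clean() receives.
scraped = {
    "name": "Example Soup",
    "recipeYield": ["Makes 4 Batches", "4 Batches"],
    "recipeInstructions": "Chop.\nSimmer.\nServe.",
    "totalTime": "PT1H30M",
}

cleaned = cleaner.clean(scraped, url="https://example.com/soup")
# cleaned["recipeInstructions"] -> [{"text": "Chop."}, {"text": "Simmer."}, {"text": "Serve."}]
# cleaned["totalTime"]          -> "1 Hour 30 Minutes"
# cleaned["image"]              -> "no image"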


def clean_string(text: str) -> str:
    if isinstance(text, list):
        text = text[0]
def clean_string(text: str | list | int) -> str:
    """Cleans a string of HTML tags and extra white space"""
    if not isinstance(text, str):
        if isinstance(text, list):
            text = text[0]

    if isinstance(text, int):
        text = str(text)
        if isinstance(text, int):
            text = str(text)

    if text == "" or text is None:
    if not text:
        return ""

    text = typing.cast(str, text)  # at this point we know text is a string

    cleaned_text = html.unescape(text)
    cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
    cleaned_text = re.sub(" +", " ", cleaned_text)
    cleaned_text = re.sub("</p>", "\n", cleaned_text)
    cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
    cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
    cleaned_text = MATCH_HTML_TAGS.sub("", cleaned_text)
    cleaned_text = MATCH_MULTI_SPACE.sub(" ", cleaned_text)
    cleaned_text = MATCH_ERRONEOUS_WHITE_SPACE.sub("\n\n", cleaned_text)

    cleaned_text = cleaned_text.replace("</p>", "\n").replace("\xa0", " ").replace("\t", " ").strip()
    return cleaned_text


def category(category: str):
    if isinstance(category, list) and len(category) > 0 and isinstance(category[0], dict):
        # If the category is a list of dicts, it's probably from a migration
        # validate that the required fields are present
        valid = []
        for cat in category:
            if "name" in cat and "slug" in cat:
                valid.append(cat)
def clean_image(image: str | list | dict | None = None, default="no image") -> str:
    """
    image attempts to parse the image field from a recipe and return a string. Currently

        return valid
    Supported Structures:
        - `["https://example.com"]` - A list of strings
        - `https://example.com` - A string
        - `{ "url": "https://example.com" }` - A dictionary with a `url` key

    if isinstance(category, str) and category != "":
        return [category]
    Raises:
        TypeError: If the image field is not a supported type a TypeError is raised.

    return []


def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]:
    # Assumes that all units are supplied in grams, except sodium which may be in mg.

    # Fn only expects a dict[str,str]. Other structures should not be parsed.
    if not isinstance(nutrition, dict):
        return {}

    # Allow for commas as decimals (common in Europe)
    # Compile once for efficiency
    re_match_digits = re.compile(r"\d+([.,]\d+)?")

    output_nutrition = {}
    for key, val in nutrition.items():
        # If the val contains digits matching the regex, add the first match to the output dict.
        # Handle unexpected datastructures safely.
        try:
            if matched_digits := re_match_digits.search(val):
                output_nutrition[key] = matched_digits.group(0)
        except Exception:
            continue

    output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()}

    if (
        "sodiumContent" in nutrition
        and type(nutrition["sodiumContent"]) == str
        and "m" not in nutrition["sodiumContent"]
        and "g" in nutrition["sodiumContent"]
    ):
        # Sodium is in grams. Parse its value, multiply by 1k and return to string.
        try:
            output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)
        except ValueError:
            # Could not parse sodium content as float, so don't touch it.
            pass

    return output_nutrition


def image(image=None) -> str:
    Returns:
        str: "no image" if any empty string is provided or the url of the image
    """
    if not image:
        return "no image"
    if isinstance(image, list):
        return image[0]
    elif isinstance(image, dict):
        return image["url"]
    elif isinstance(image, str):
        return image
    else:
        raise Exception(f"Unrecognised image URL format: {image}")
        return default

    match image:
        case str(image):
            return image
        case list(image):
            return image[0]
        case {"url": str(image)}:
            return image
        case _:
            raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
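A short illustration of the shapes clean_image accepts, consistent with the docstring above (URLs invented):

from mealie.services.scraper import cleaner

# Illustration only: each supported shape resolves to the same URL string.
assert cleaner.clean_image("https://example.com/pie.jpg") == "https://example.com/pie.jpg"
assert cleaner.clean_image(["https://example.com/pie.jpg"]) == "https://example.com/pie.jpg"
assert cleaner.clean_image({"url": "https://example.com/pie.jpg"}) == "https://example.com/pie.jpg"
assert cleaner.clean_image(None) == "no image"  # falls back to the `default` argument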


def instructions(instructions) -> list[dict]:
    try:
        instructions = json.loads(instructions)
    except Exception:
        pass
def clean_instructions(steps_object: list | dict | str, default: list | None = None) -> list[dict]:
    """
    instructions attempts to parse the instructions field from a recipe and return a list of
    dictionaries. See match statement for supported types and structures

    if not instructions:
        return []
    Raises:
        TypeError: If the instructions field is not a supported type a TypeError is raised.

    # Dictionary (Keys: step number strings, Values: the instructions)
    if isinstance(instructions, dict):
        instructions = list(instructions.values())
    Returns:
        list[dict]: An ordered list of dictionaries with the keys `text`
    """
    if not steps_object:
        return default or []

    if isinstance(instructions, list) and isinstance(instructions[0], list):
        instructions = instructions[0]

    # One long string split by (possibly multiple) new lines
    if isinstance(instructions, str):
        return [{"text": _instruction(line)} for line in instructions.splitlines() if line]

    # Plain strings in a list
    elif isinstance(instructions, list) and isinstance(instructions[0], str):
        return [{"text": _instruction(step)} for step in instructions]

    # Dictionaries (let's assume it's a HowToStep) in a list
    elif isinstance(instructions, list) and isinstance(instructions[0], dict):
        # Try List of Dictionary without "@type" or "type"
        if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
            return [{"text": _instruction(step["text"])} for step in instructions]

        try:
            # If HowToStep is under HowToSection
            sectionSteps = []
            for step in instructions:
                if step["@type"] == "HowToSection":
                    for sectionStep in step["itemListElement"]:
                        sectionSteps.append(sectionStep)

            if len(sectionSteps) > 0:
                return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]

            return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
        except Exception as e:
            logger.error(e)
            # Not "@type", try "type"
            try:
                return [
                    {"text": _instruction(step["properties"]["text"])}
                    for step in instructions
                    if step["type"].find("HowToStep") > -1
                ]
            except Exception:
                pass

    else:
        raise Exception(f"Unrecognised instruction format: {instructions}")

    return []
    match steps_object:
        case [{"text": str()}]:  # Base Case
            return steps_object
        case [{"text": str()}, *_]:
            # This is the most common case. Most other operations eventually resolve to this
            # match case before being converted to a list of instructions
            #
            # [
            #     {"text": "Instruction A"},
            #     {"text": "Instruction B"},
            # ]
            #
            return [
                {"text": _sanitize_instruction_text(instruction["text"])}
                for instruction in steps_object
                if instruction["text"].strip()
            ]
        case {0: {"text": str()}} | {"0": {"text": str()}} | {1: {"text": str()}} | {"1": {"text": str()}}:
            # Some recipes have a dict with a string key representing the index, unsure if these can
            # be an int or not so we match against both. Additionally, we match against both 0 and 1 indexed
            # list like dicts.
            #
            # {
            #     "0": {"text": "Instruction A"},
            #     "1": {"text": "Instruction B"},
            # }
            #
            steps_object = typing.cast(dict, steps_object)
            return clean_instructions([x for x in steps_object.values()])
        case str(step_as_str):
            # Strings are weird, some sites return a single string with newlines
            # others return a json string for some reason
            #
            # "Instruction A\nInstruction B\nInstruction C"
            # '{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}'
            #
            if step_as_str.startswith("[") or step_as_str.startswith("{"):
                try:
                    return clean_instructions(json.loads(step_as_str))
                except json.JSONDecodeError:
                    pass
            return [
                {"text": _sanitize_instruction_text(instruction)}
                for instruction in step_as_str.splitlines()
                if instruction.strip()
            ]
        case [str(), *_]:
            # Assume list of strings is a valid list of instructions
            #
            # [
            #     "Instruction A",
            #     "Instruction B",
            # ]
            #
            return [
                {"text": _sanitize_instruction_text(instruction)} for instruction in steps_object if instruction.strip()
            ]
        case [{"@type": "HowToSection"}, *_] | [{"type": "HowToSection"}, *_]:
            # HowToSections should have the following layout,
            # {
            #     "@type": "HowToSection",
            #     "itemListElement": [
            #         {
            #             "@type": "HowToStep",
            #             "text": "Instruction A"
            #         },
            #     }
            #
            steps_object = typing.cast(list[dict[str, str]], steps_object)
            return clean_instructions(functools.reduce(operator.concat, [x["itemListElement"] for x in steps_object], []))  # type: ignore
        case _:
            raise TypeError(f"Unexpected type for instructions: {type(steps_object)}, {steps_object}")
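For reference, a hedged example of the HowToSection branch above: nested itemListElement lists are concatenated and fed back through clean_instructions (input invented, mirroring the new test data):

from mealie.services.scraper import cleaner

# Illustration only: HowToSection wrappers are flattened into plain steps.
sections = [
    {"@type": "HowToSection", "itemListElement": [{"@type": "HowToStep", "text": "Instruction A"}]},
    {"@type": "HowToSection", "itemListElement": [{"@type": "HowToStep", "text": "Instruction B"}]},
]
assert cleaner.clean_instructions(sections) == [{"text": "Instruction A"}, {"text": "Instruction B"}]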


def _instruction(line) -> str:
def _sanitize_instruction_text(line: str | dict) -> str:
    """
    _sanitize_instruction_text does some basic checking if the value is a string or dictionary
    and returns the value of the `text` key if it is a dictionary. The returned string is passed through the
    `clean_string` function to remove any html tags and extra whitespace in a loop until the string
    is stable.

    Calling `clean_string` in a loop is necessary because some sites return a string with erroneously escaped
    html tags or markup.
    """
    if isinstance(line, dict):
        # Some recipes do not adhere to the schema
        try:
@@ -195,58 +211,111 @@ def _instruction(line) -> str:
        except Exception:
            line = ""

    if not line:
        return ""

    line = typing.cast(str, line)
    clean_line = clean_string(line.strip())
    # Some sites erroneously escape their strings on multiple levels

    while not clean_line == (clean_line := clean_string(clean_line)):
        pass

    return clean_line
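A hedged illustration of why the cleaning loop runs until the string is stable: doubly escaped markup needs more than one pass (the input is contrived):

from mealie.services.scraper import cleaner

# "&amp;lt;p&amp;gt;" unescapes to "&lt;p&gt;" on the first pass and only becomes
# a strippable "<p>" tag on the second pass, so a single clean_string call is not enough.
doubly_escaped = "&amp;lt;p&amp;gt;Hello World&amp;lt;/p&amp;gt;"
assert cleaner._sanitize_instruction_text(doubly_escaped) == "Hello World"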


def ingredient(ingredients: list | None) -> list[str]:
    if ingredients:
        return [clean_string(ing) for ing in ingredients]
    else:
        return []
def clean_ingredients(ingredients: list | str | None, default: list = None) -> list[str]:
    """
    ingredient attempts to parse the ingredients field from a recipe and return a list of strings

    Supported Structures:
        - `["1 cup flour"]` - A list of strings
        - `"1 cup flour"` - A string
        - `None` - returns an empty list

    Raises:
        TypeError: If the ingredients field is not a supported type a TypeError is raised.
    """
    match ingredients:
        case None:
            return default or []
        case list(ingredients):
            return [clean_string(ingredient) for ingredient in ingredients]
        case str(ingredients):
            return [clean_string(ingredient) for ingredient in ingredients.splitlines()]
        case _:
            raise TypeError(f"Unexpected type for ingredients: {type(ingredients)}, {ingredients}")
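A short illustration of the supported shapes listed in the docstring (values invented):

from mealie.services.scraper import cleaner

# Illustration only: strings are split on newlines, lists are cleaned item by item.
assert cleaner.clean_ingredients("1 cup flour\n1 cup sugar") == ["1 cup flour", "1 cup sugar"]
assert cleaner.clean_ingredients(["1 cup flour"]) == ["1 cup flour"]
assert cleaner.clean_ingredients(None) == []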


def yield_amount(yld) -> str:
def clean_yield(yld: str | list[str] | None) -> str:
    """
    yield_amount attempts to parse out the yield amount from a recipe.

    Supported Structures:
        - `"4 servings"` - returns the string unmodified
        - `["4 servings", "4 Pies"]` - returns the last value

    Returns:
        str: The yield amount, if it can be parsed else an empty string
    """
    if not yld:
        return ""

    if isinstance(yld, list):
        return yld[-1]
    else:
        return yld

    return yld


def clean_time(time_entry):
    if time_entry is None or time_entry == "" or time_entry == " ":
def clean_time(time_entry: str | timedelta | None) -> None | str:
    """_summary_

    Supported Structures:
        - `None` - returns None
        - `"PT1H"` - returns "1 hour"
        - `"PT1H30M"` - returns "1 hour 30 minutes"
        - `timedelta(hours=1, minutes=30)` - returns "1 hour 30 minutes"

    Raises:
        TypeError: if the type is not supported a TypeError is raised

    Returns:
        None | str: None if the time_entry is None, otherwise a string representing the time
    """
    if not time_entry:
        return None
    elif isinstance(time_entry, timedelta):
        return pretty_print_timedelta(time_entry)
    elif isinstance(time_entry, datetime):
        pass
        # print(time_entry)
    elif isinstance(time_entry, str):
        try:
            time_delta_object = parse_duration(time_entry)
            return pretty_print_timedelta(time_delta_object)
        except ValueError:
            logger.error(f"Could not parse time_entry `{time_entry}`")

    match time_entry:
        case str(time_entry):
            if not time_entry.strip():
                return None

            try:
                time_delta_instructionsect = parse_duration(time_entry)
                return pretty_print_timedelta(time_delta_instructionsect)
            except ValueError:
                return str(time_entry)
        case timedelta():
            return pretty_print_timedelta(time_entry)
        case datetime():
            # TODO: Not sure what to do here
            return str(time_entry)
    else:
        return str(time_entry)
        case _:
            raise TypeError(f"Unexpected type for time: {type(time_entry)}, {time_entry}")
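A brief illustration of clean_time on the forms named in the docstring; note that the actual output is title-cased by pretty_print_timedelta, and unparseable strings pass through unchanged (inputs invented):

from datetime import timedelta

from mealie.services.scraper import cleaner

assert cleaner.clean_time(None) is None
assert cleaner.clean_time("PT1H30M") == "1 Hour 30 Minutes"
assert cleaner.clean_time(timedelta(minutes=30)) == "30 Minutes"
assert cleaner.clean_time("not a duration") == "not a duration"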


def parse_duration(iso_duration):
    """Parses an ISO 8601 duration string into a datetime.timedelta instance.
def parse_duration(iso_duration: str) -> timedelta:
    """
    Parses an ISO 8601 duration string into a datetime.timedelta instance.

    Args:
        iso_duration: an ISO 8601 duration string.
    Returns:
        a datetime.timedelta instance

    Raises:
        ValueError: if the input string is not a valid ISO 8601 duration string.
    """
    m = re.match(
        r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?"
        r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$",
        iso_duration,
    )

    m = MATCH_ISO_STR.match(iso_duration)

    if m is None:
        raise ValueError("invalid ISO 8601 duration string")

@@ -257,7 +326,7 @@ def parse_duration(iso_duration):
    # convert parsed years and months to specific number of days.

    times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0}
    for unit, _ in times.items():
    for unit in times.keys():
        if m.group(unit):
            times[unit] = int(float(m.group(unit)))

@@ -299,30 +368,73 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places
    return " ".join(out_list)


def clean_categories(category: str | list) -> list[str]:
    if not category:
        return []

    match category:
        case str(category):
            if not category.strip():
                return []

            return [category]
        case [str(), *_]:
            return [cat.strip().title() for cat in category if cat.strip()]
        case [{"name": str(), "slug": str()}, *_]:
            # Special case for when we use the cleaner to cleanup a migration.
            #
            # [
            #     { "name": "Dessert", "slug": "dessert"}
            # ]
            #
            return [cat["name"] for cat in category if "name" in cat]
        case _:
            raise TypeError(f"Unexpected type for category: {type(category)}, {category}")


def clean_tags(data: str | list[str]) -> list[str]:
    """
    Gets keywords as a list or natural language list and returns them into a list of strings of individual tags
    Gets keywords as a list or natural language list and returns
    them into a list of strings of individual tags
    """
    if data is None:
    if not data:
        return []

    if isinstance(data, list):
        all_str = True
        i = 0
        while all_str and i < len(data):
            all_str = isinstance(data[i], str)
            i = i + 1
    match data:
        case [str(), *_]:
            return [tag.strip().title() for tag in data if tag.strip()]
        case str(data):
            return clean_tags([t for t in data.split(",")])
        case _:
            return []
            # should probably raise exception
            # raise TypeError(f"Unexpected type for tags: {type(data)}, {data}")

        if all_str:
            return data
        return []

    if isinstance(data, str):
        tag_list = data.split(",")
def clean_nutrition(nutrition: dict | None) -> dict[str, str]:
    """
    clean_nutrition takes a dictionary of nutrition information and cleans it up
    to be stored in the database. It will remove any keys that are not in the
    list of valid keys

        for i in range(len(tag_list)):
            tag_list[i] = tag_list[i].strip().capitalize()
    Assumptions:
        - All units are supplied in grams, except sodium, which may be in milligrams

        return tag_list
    Returns:
        dict[str, str]: If the argument is None, or not a dictionary, an empty dictionary is returned
    """
    if not isinstance(nutrition, dict):
        return {}

    return []
    output_nutrition = {}
    for key, val in nutrition.items():
        with contextlib.suppress(AttributeError, TypeError):
            if matched_digits := MATCH_DIGITS.search(val):
                output_nutrition[key] = matched_digits.group(0).replace(",", ".")

    if sodium := nutrition.get("sodiumContent", None):
        if isinstance(sodium, str) and "m" not in sodium and "g" in sodium:
            with contextlib.suppress(AttributeError, TypeError):
                output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)

    return output_nutrition
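A hedged illustration of the nutrition assumptions above, matching the behavior the new tests exercise (values invented):

from mealie.services.scraper import cleaner

# Illustration only: commas become decimal points, units are stripped,
# and gram-denominated sodium is converted to milligrams.
assert cleaner.clean_nutrition({"calories": "100,5 kcal"}) == {"calories": "100.5"}
assert cleaner.clean_nutrition({"sodiumContent": "10g"}) == {"sodiumContent": "10000.0"}
assert cleaner.clean_nutrition(None) == {}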
@@ -123,7 +123,7 @@ class RecipeScraperPackage(ABCScraperStrategy):

        self.logger.debug(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")

        instruction_as_text = cleaner.instructions(instruction_as_text)
        instruction_as_text = cleaner.clean_instructions(instruction_as_text)

        self.logger.debug(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")

@@ -147,7 +147,9 @@ class RecipeScraperPackage(ABCScraperStrategy):
            description=try_get_default(None, "description", "", cleaner.clean_string),
            nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
            recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
            recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
            recipe_ingredient=try_get_default(
                scraped_data.ingredients, "recipeIngredient", [""], cleaner.clean_ingredients
            ),
            recipe_instructions=get_instructions(),
            total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
            prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
@@ -128,20 +128,20 @@ def test_create_by_url_with_tags(
    response = api_client.get(api_routes.recipes_slug(slug), headers=unique_user.token)
    assert response.status_code == 200

    # Verify the tags are present
    # Verify the tags are present and title cased
    expected_tags = {
        "sauté",
        "pea",
        "noodle",
        "udon noodle",
        "ramen noodle",
        "dinner",
        "main",
        "vegetarian",
        "easy",
        "quick",
        "weeknight meals",
        "web",
        "Sauté",
        "Pea",
        "Noodle",
        "Udon Noodle",
        "Ramen Noodle",
        "Dinner",
        "Main",
        "Vegetarian",
        "Easy",
        "Quick",
        "Weeknight Meals",
        "Web",
    }

    recipe = json.loads(response.text)
@@ -0,0 +1,56 @@
import json
import re
from pathlib import Path

import pytest

from mealie.services.scraper import cleaner
from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
from tests import data as test_data

# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
url_validation_regex = re.compile(
    r"^(?:http|ftp)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)

test_cleaner_data = [
    (test_data.json_best_homemade_salsa_recipe, 2),
    (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3),
    (test_data.json_bon_appetit, 8),
    (test_data.json_chunky_apple_cake, 4),
    (test_data.json_dairy_free_impossible_pumpkin_pie, 7),
    (test_data.json_how_to_make_instant_pot_spaghetti, 8),
    (test_data.json_instant_pot_chicken_and_potatoes, 4),
    (test_data.json_instant_pot_kerala_vegetable_stew, 13),
    (test_data.json_jalapeno_popper_dip, 4),
    (test_data.json_microwave_sweet_potatoes_04783, 4),
    (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4),
    (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3),
]


@pytest.mark.parametrize("json_file,num_steps", test_cleaner_data)
def test_cleaner_clean(json_file: Path, num_steps):
    recipe_data = cleaner.clean(json.loads(json_file.read_text()))
    assert len(recipe_data["recipeInstructions"]) == num_steps


def test_html_with_recipe_data():
    path = test_data.html_healthy_pasta_bake_60759
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"

    open_graph_strategy = RecipeScraperOpenGraph(url)

    recipe_data = open_graph_strategy.get_recipe_fields(path.read_text())

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
@@ -0,0 +1,541 @@
from dataclasses import dataclass
from datetime import timedelta
from typing import Any

import pytest

from mealie.services.scraper import cleaner


@dataclass(slots=True)
class CleanerCase:
    test_id: str
    input: Any
    expected: Any
    exception: Any = None


clean_string_test_cases = (
    CleanerCase(
        test_id="empty_string",
        input="",
        expected="",
    ),
    CleanerCase(
        test_id="html",
        input="<p> Hello World </p>",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="no_change",
        input="Hello World",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="html_with_extra_closing_tag",
        input="<p> Hello World </p></p>",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="multiple_spaces",
        input="Hello  World",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="tabs",
        input="\tHello World\t",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="nbsp",
        input="\xa0Hello World\xa0",
        expected="Hello World",
    ),
    CleanerCase(
        test_id="list",
        input=["Hello World", "Goodbye World"],
        expected="Hello World",
    ),
    CleanerCase(
        test_id="int",
        input=1,
        expected="1",
    ),
)


@pytest.mark.parametrize("case", clean_string_test_cases, ids=(x.test_id for x in clean_string_test_cases))
def test_cleaner_clean_string(case: CleanerCase) -> None:
    assert case.expected == cleaner.clean_string(case.input)


image_cleaner_test_cases = (
    CleanerCase(
        test_id="empty_string",
        input="",
        expected="no image",
    ),
    CleanerCase(
        test_id="no_change",
        input="https://example.com/image.jpg",
        expected="https://example.com/image.jpg",
    ),
    CleanerCase(
        test_id="dict with url key",
        input={"url": "https://example.com/image.jpg"},
        expected="https://example.com/image.jpg",
    ),
    CleanerCase(
        test_id="list of strings",
        input=["https://example.com/image.jpg"],
        expected="https://example.com/image.jpg",
    ),
)


@pytest.mark.parametrize("case", image_cleaner_test_cases, ids=(x.test_id for x in image_cleaner_test_cases))
def test_cleaner_image_cleaner(case: CleanerCase):
    result = cleaner.clean_image(case.input)
    assert case.expected == result


instruction_test_cases = (
    CleanerCase(
        test_id="single string",
        input="Instruction A\nInstruction B\nInstruction C",
        expected=None,
    ),
    CleanerCase(
        test_id="single string multiple newlines",
        input="Instruction A\n\nInstruction B\n\nInstruction C",
        expected=None,
    ),
    CleanerCase(
        test_id="common list of dicts",
        input=[
            {"text": "Instruction A"},
            {"text": "Instruction B"},
            {"text": "Instruction C"},
        ],
        expected=None,
    ),
    CleanerCase(
        test_id="dict with int keys",
        input={
            0: {"text": "Instruction A"},
            1: {"text": "Instruction B"},
            2: {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with str num keys",
        input={
            "0": {"text": "Instruction A"},
            "1": {"text": "Instruction B"},
            "2": {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with str num keys",
        input={
            "1": {"text": "Instruction A"},
            "2": {"text": "Instruction B"},
            "3": {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="dict with str num keys",
        input={
            1: {"text": "Instruction A"},
            2: {"text": "Instruction B"},
            3: {"text": "Instruction C"},
        },
        expected=None,
    ),
    CleanerCase(
        test_id="raw json str",
        input='{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}',
        expected=None,
    ),
    CleanerCase(
        test_id="how to steps",
        input=[
            {
                "@type": "HowToSection",
                "itemListElement": [
                    {
                        "@type": "HowToStep",
                        "text": "Instruction A",
                    },
                    {
                        "@type": "HowToStep",
                        "text": "Instruction B",
                    },
                ],
            },
            {
                "@type": "HowToSection",
                "itemListElement": [
                    {
                        "@type": "HowToStep",
                        "text": "Instruction C",
                    },
                ],
            },
        ],
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (1)",
        input="Instruction A\n\nInstruction B\n\nInstruction C\n\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (2)",
        input="Instruction A\nInstruction B\nInstruction C\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (3)",
        input="Instruction A\r\n\r\nInstruction B\r\n\r\nInstruction C\r\n\r\n",
        expected=None,
    ),
    CleanerCase(
        test_id="excessive whitespace str (4)",
        input="Instruction A\r\nInstruction B\r\nInstruction C\r\n",
        expected=None,
    ),
)


@pytest.mark.parametrize("instructions", instruction_test_cases, ids=(x.test_id for x in instruction_test_cases))
def test_cleaner_instructions(instructions: CleanerCase):
    result = cleaner.clean_instructions(instructions.input)

    expected = [
        {"text": "Instruction A"},
        {"text": "Instruction B"},
        {"text": "Instruction C"},
    ]

    assert result == expected


ingredients_test_cases = (
    CleanerCase(
        input="",
        expected=[],
        test_id="empty string",
    ),
    CleanerCase(
        input="1 cup of flour",
        expected=["1 cup of flour"],
        test_id="single ingredient string",
    ),
    CleanerCase(
        input=["1 cup of flour"],
        expected=["1 cup of flour"],
        test_id="single ingredient list",
    ),
    CleanerCase(
        input=["1 cup of flour", "1 cup of sugar"],
        expected=["1 cup of flour", "1 cup of sugar"],
        test_id="multiple ingredient list",
    ),
    CleanerCase(
        input={"0": "1 cup of flour", "1": "1 cup of sugar"},
        expected=None,
        test_id="multiple ingredient dictionary",
        exception=TypeError,
    ),
)


@pytest.mark.parametrize("ingredients", ingredients_test_cases, ids=(x.test_id for x in ingredients_test_cases))
def test_cleaner_clean_ingredients(ingredients: CleanerCase):

    if ingredients.exception:
        with pytest.raises(ingredients.exception):
            cleaner.clean_ingredients(ingredients.input)

        return

    assert ingredients.expected == cleaner.clean_ingredients(ingredients.input)


yield_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected="",
    ),
    CleanerCase(
        test_id="list of strings",
        input=["Makes 4 Batches", "4 Batches"],
        expected="4 Batches",
    ),
    CleanerCase(
        test_id="basic string",
        input="Makes 4 Batches",
        expected="Makes 4 Batches",
    ),
    CleanerCase(
        test_id="empty list",
        input=[],
        expected="",
    ),
)


@pytest.mark.parametrize("case", yield_test_cases, ids=(x.test_id for x in yield_test_cases))
def test_cleaner_clean_yield_amount(case: CleanerCase):
    result = cleaner.clean_yield(case.input)
    assert case.expected == result


time_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=None,
    ),
    CleanerCase(
        test_id="empty whitespace",
        input=" ",
        expected=None,
    ),
    CleanerCase(
        test_id="none",
        input=None,
        expected=None,
    ),
    CleanerCase(
        test_id="invalid string",
        input="invalid",
        expected="invalid",
    ),
    CleanerCase(
        test_id="timedelta",
        input=timedelta(minutes=30),
        expected="30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (1)",
        input="PT2H30M",
        expected="2 Hours 30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (2)",
        input="PT30M",
        expected="30 Minutes",
    ),
    CleanerCase(
        test_id="timedelta string (3)",
        input="PT2H",
        expected="2 Hours",
    ),
    CleanerCase(
        test_id="timedelta string (4)",
        input="P1DT1H1M1S",
        expected="1 day 1 Hour 1 Minute 1 Second",
    ),
    CleanerCase(
        test_id="timedelta string (4)",
        input="P1DT1H1M1.53S",
        expected="1 day 1 Hour 1 Minute 1 Second",
    ),
    CleanerCase(
        test_id="timedelta string (5) invalid",
        input="PT",
        expected="none",
    ),
    CleanerCase(
        test_id="timedelta string (6) PT-3H",
        input="PT-3H",
        expected="PT-3H",
    ),
)


@pytest.mark.parametrize("case", time_test_cases, ids=(x.test_id for x in time_test_cases))
def test_cleaner_clean_time(case: CleanerCase):
    result = cleaner.clean_time(case.input)
    assert case.expected == result


category_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=[],
    ),
    CleanerCase(
        test_id="empty whitespace",
        input=" ",
        expected=[],
    ),
    CleanerCase(
        test_id="empty list",
        input=[],
        expected=[],
    ),
    CleanerCase(
        test_id="single string",
        input="Dessert",
        expected=["Dessert"],
    ),
    CleanerCase(
        test_id="nested dictionary",
        input=[
            {"name": "Dessert", "slug": "dessert"},
            {"name": "Lunch", "slug": "lunch"},
        ],
        expected=["Dessert", "Lunch"],
    ),
)


@pytest.mark.parametrize("case", category_test_cases, ids=(x.test_id for x in category_test_cases))
def test_cleaner_clean_categories(case: CleanerCase):
    result = cleaner.clean_categories(case.input)
    assert case.expected == result


tag_test_cases = (
    CleanerCase(
        test_id="empty string",
        input="",
        expected=[],
    ),
    CleanerCase(
        test_id="single tag",
        input="tag",
        expected=["Tag"],
    ),
    CleanerCase(
        test_id="comma separated tags",
        input="tag1, tag2, tag3",
        expected=["Tag1", "Tag2", "Tag3"],
    ),
    CleanerCase(
        test_id="list of tags",
        input=["tag1", "tag2", "tag3"],
        expected=["Tag1", "Tag2", "Tag3"],
    ),
)


@pytest.mark.parametrize("case", tag_test_cases, ids=(x.test_id for x in tag_test_cases))
def test_cleaner_clean_tags(case: CleanerCase):
    result = cleaner.clean_tags(case.input)
    assert case.expected == result


nutrition_test_cases = (
    CleanerCase(
        test_id="empty dict",
        input={},
        expected={},
    ),
    CleanerCase(
        test_id="valid keys",
        input={
            "calories": "100mg",
            "fatContent": "10",
        },
        expected={
            "calories": "100",
            "fatContent": "10",
        },
    ),
    CleanerCase(
        test_id="invalid keys get removed",
        input={
            "calories": "100mg",
            "fatContent": "10",
            "invalid": "invalid",
        },
        expected={
            "calories": "100",
            "fatContent": "10",
        },
    ),
    CleanerCase(
        test_id="support `,` separated numbers instead of `.` (common in Europe)",
        input={
            "calories": "100,000mg",
            "fatContent": "10,000",
        },
        expected={
            "calories": "100.000",
            "fatContent": "10.000",
        },
    ),
    CleanerCase(
        test_id="special support for sodiumContent (g -> mg)",
        input={
            "sodiumContent": "10g",
        },
        expected={
            "sodiumContent": "10000.0",
        },
    ),
    CleanerCase(
        test_id="special support for sodiumContent (mg -> mg)",
        input={
            "sodiumContent": "10000mg",
        },
        expected={
            "sodiumContent": "10000",
        },
    ),
    CleanerCase(
        test_id="strip units",
        input={
            "calories": "100 kcal",
        },
        expected={
            "calories": "100",
        },
    ),
    CleanerCase(
        test_id="list as value continues after first value",
        input={
            "calories": ["100 kcal"],
            "sugarContent": "but still tries 555.321",
        },
        expected={
            "sugarContent": "555.321",
        },
    ),
    CleanerCase(
        test_id="multiple decimals",
        input={
            "sodiumContent": "10.1.2g",
        },
        expected={
            "sodiumContent": "10100.0",
        },
    ),
)


@pytest.mark.parametrize("case", nutrition_test_cases, ids=(x.test_id for x in nutrition_test_cases))
def test_cleaner_clean_nutrition(case: CleanerCase):
    result = cleaner.clean_nutrition(case.input)
    assert case.expected == result


@pytest.mark.parametrize(
    "t,max_components,max_decimal_places,expected",
    [
        (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"),
        (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"),
        (timedelta(days=365), None, 2, "1 year"),
    ],
)
def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
    assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected
@@ -1,140 +0,0 @@
import json
import re
from datetime import timedelta
from pathlib import Path

import pytest

from mealie.services.scraper import cleaner
from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
from tests import data as test_data

# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
url_validation_regex = re.compile(
    r"^(?:http|ftp)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)

test_cleaner_data = [
    (test_data.json_best_homemade_salsa_recipe, 2),
    (test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3),
    (test_data.json_bon_appetit, 8),
    (test_data.json_chunky_apple_cake, 4),
    (test_data.json_dairy_free_impossible_pumpkin_pie, 7),
    (test_data.json_how_to_make_instant_pot_spaghetti, 8),
    (test_data.json_instant_pot_chicken_and_potatoes, 4),
    (test_data.json_instant_pot_kerala_vegetable_stew, 13),
    (test_data.json_jalapeno_popper_dip, 4),
    (test_data.json_microwave_sweet_potatoes_04783, 4),
    (test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4),
    (test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3),
]


@pytest.mark.parametrize(
    "json_file,num_steps",
    test_cleaner_data,
)
def test_cleaner_clean(json_file: Path, num_steps):
    recipe_data = cleaner.clean(json.loads(json_file.read_text()))
    assert len(recipe_data["recipeInstructions"]) == num_steps


def test_clean_category():
    assert cleaner.category("my-category") == ["my-category"]


def test_clean_string():
    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"


def test_clean_image():
    assert cleaner.image(None) == "no image"
    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
    assert cleaner.image({"url": "My URL!"}) == "My URL!"
    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"


@pytest.mark.parametrize(
    "nutrition,expected",
    [
        (None, {}),
        ({"calories": "105 kcal"}, {"calories": "105"}),
        ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}),
        ({"calories": ""}, {}),
        ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}),
        ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}),
        ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}),
        ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}),
        ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}),
    ],
)
def test_clean_nutrition(nutrition, expected):
    assert cleaner.clean_nutrition(nutrition) == expected


@pytest.mark.parametrize(
    "instructions",
    [
        "A\n\nB\n\nC\n\n",
        "A\nB\nC\n",
        "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
        "A\r\nB\r\nC\r\n",
        ["A", "B", "C"],
        [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]],
    ],
)
def test_cleaner_instructions(instructions):
    assert cleaner.instructions(instructions) == [
        {"text": "A"},
        {"text": "B"},
        {"text": "C"},
    ]


def test_html_with_recipe_data():
    path = test_data.html_healthy_pasta_bake_60759
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"

    open_graph_strategy = RecipeScraperOpenGraph(url)

    recipe_data = open_graph_strategy.get_recipe_fields(path.read_text())

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])


@pytest.mark.parametrize(
    "time_delta,expected",
    [
        ("PT2H30M", "2 Hours 30 Minutes"),
        ("PT30M", "30 Minutes"),
        ("PT3H", "3 Hours"),
        ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"),
        ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"),
        ("PT-3H", "PT-3H"),
        ("PT", "none"),
    ],
)
def test_time_cleaner(time_delta, expected):
    assert cleaner.clean_time(time_delta) == expected


@pytest.mark.parametrize(
    "t,max_components,max_decimal_places,expected",
    [
        (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"),
        (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"),
        (timedelta(days=365), None, 2, "1 year"),
    ],
)
def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
    assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected