mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-05-24 01:12:54 -04:00
fix: recipe scraper image cleaning (#2139)
* updated image cleaner: enabled the image cleaner and added a case for nested image dicts
* refactored image cleaner to return a list of URLs
This commit is contained in:
parent
53fe5921d2
commit
05e2566c35
@ -47,7 +47,7 @@ def clean(recipe_data: dict, url=None) -> dict:
|
||||
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
|
||||
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
|
||||
recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
|
||||
recipe_data["image"] = clean_image(recipe_data.get("image"))
|
||||
recipe_data["image"] = clean_image(recipe_data.get("image"))[0]
|
||||
recipe_data["slug"] = slugify(recipe_data.get("name", ""))
|
||||
recipe_data["orgURL"] = url
|
||||
|
||||
@ -77,31 +77,34 @@ def clean_string(text: str | list | int) -> str:
|
||||
return cleaned_text
|
||||
|
||||
|
||||
def clean_image(image: str | list | dict | None = None, default="no image") -> str:
|
||||
def clean_image(image: str | list | dict | None = None, default: str = "no image") -> list[str]:
|
||||
"""
|
||||
image attempts to parse the image field from a recipe and return a string. Currently
|
||||
|
||||
Supported Structures:
|
||||
- `["https://example.com"]` - A list of strings
|
||||
- `https://example.com` - A string
|
||||
- `{ "url": "https://example.com"` - A dictionary with a `url` key
|
||||
- `{ "url": "https://example.com" }` - A dictionary with a `url` key
|
||||
- `["https://example.com"]` - A list of strings
|
||||
- `[{ "url": "https://example.com" }]` - A list of dictionaries with a `url` key
|
||||
|
||||
Raises:
|
||||
TypeError: If the image field is not a supported type a TypeError is raised.
|
||||
|
||||
Returns:
|
||||
str: "no image" if an empty string is provided, otherwise the url of the image
|
||||
list[str]: list of urls, or [default] if input is empty
|
||||
"""
|
||||
if not image:
|
||||
return default
|
||||
return [default]
|
||||
|
||||
match image: # noqa - match statement not supported
|
||||
match image:
|
||||
case str(image):
|
||||
return [image]
|
||||
case [str(_), *_]:
|
||||
return image
|
||||
case list(image):
|
||||
return image[0]
|
||||
case [{"url": str(_)}, *_]:
|
||||
return [x["url"] for x in image]
|
||||
case {"url": str(image)}:
|
||||
return image
|
||||
return [image]
|
||||
case _:
|
||||
raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
|
||||
|
||||
|
@ -150,7 +150,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
|
||||
recipe = Recipe(
|
||||
name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
|
||||
slug="",
|
||||
image=try_get_default(None, "image", None),
|
||||
image=try_get_default(None, "image", None, cleaner.clean_image),
|
||||
description=try_get_default(None, "description", "", cleaner.clean_string),
|
||||
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
|
||||
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
|
||||
|
@ -73,22 +73,27 @@ image_cleaner_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty_string",
|
||||
input="",
|
||||
expected="no image",
|
||||
expected=["no image"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="no_change",
|
||||
input="https://example.com/image.jpg",
|
||||
expected="https://example.com/image.jpg",
|
||||
expected=["https://example.com/image.jpg"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with url key",
|
||||
input={"url": "https://example.com/image.jpg"},
|
||||
expected="https://example.com/image.jpg",
|
||||
expected=["https://example.com/image.jpg"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list of strings",
|
||||
input=["https://example.com/image.jpg"],
|
||||
expected="https://example.com/image.jpg",
|
||||
expected=["https://example.com/image.jpg"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list of dicts with url key",
|
||||
input=[{"url": "https://example.com/image.jpg"}],
|
||||
expected=["https://example.com/image.jpg"],
|
||||
),
|
||||
)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user