mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-06-04 06:04:53 -04:00
fix: recipe scraper image cleaning (#2139)
* updated image cleaner: enabled the image cleaner and added a case for nested image dicts
* refactored image cleaner to return a list of URLs
This commit is contained in:
parent
53fe5921d2
commit
05e2566c35
@ -47,7 +47,7 @@ def clean(recipe_data: dict, url=None) -> dict:
|
|||||||
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
|
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
|
||||||
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
|
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
|
||||||
recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
|
recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
|
||||||
recipe_data["image"] = clean_image(recipe_data.get("image"))
|
recipe_data["image"] = clean_image(recipe_data.get("image"))[0]
|
||||||
recipe_data["slug"] = slugify(recipe_data.get("name", ""))
|
recipe_data["slug"] = slugify(recipe_data.get("name", ""))
|
||||||
recipe_data["orgURL"] = url
|
recipe_data["orgURL"] = url
|
||||||
|
|
||||||
@ -77,31 +77,34 @@ def clean_string(text: str | list | int) -> str:
|
|||||||
return cleaned_text
|
return cleaned_text
|
||||||
|
|
||||||
|
|
||||||
def clean_image(image: str | list | dict | None = None, default="no image") -> str:
|
def clean_image(image: str | list | dict | None = None, default: str = "no image") -> list[str]:
|
||||||
"""
|
"""
|
||||||
image attempts to parse the image field from a recipe and return a string. Currently
|
image attempts to parse the image field from a recipe and return a string. Currently
|
||||||
|
|
||||||
Supported Structures:
|
Supported Structures:
|
||||||
- `["https://exmaple.com"]` - A list of strings
|
|
||||||
- `https://exmaple.com` - A string
|
- `https://exmaple.com` - A string
|
||||||
- `{ "url": "https://exmaple.com"` - A dictionary with a `url` key
|
- `{ "url": "https://exmaple.com" }` - A dictionary with a `url` key
|
||||||
|
- `["https://exmaple.com"]` - A list of strings
|
||||||
|
- `[{ "url": "https://exmaple.com" }]` - A list of dictionaries with a `url` key
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
TypeError: If the image field is not a supported type a TypeError is raised.
|
TypeError: If the image field is not a supported type a TypeError is raised.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: "no image" if any empty string is provided or the url of the image
|
list[str]: list of urls, or [default] if input is empty
|
||||||
"""
|
"""
|
||||||
if not image:
|
if not image:
|
||||||
return default
|
return [default]
|
||||||
|
|
||||||
match image: # noqa - match statement not supported
|
match image:
|
||||||
case str(image):
|
case str(image):
|
||||||
|
return [image]
|
||||||
|
case [str(_), *_]:
|
||||||
return image
|
return image
|
||||||
case list(image):
|
case [{"url": str(_)}, *_]:
|
||||||
return image[0]
|
return [x["url"] for x in image]
|
||||||
case {"url": str(image)}:
|
case {"url": str(image)}:
|
||||||
return image
|
return [image]
|
||||||
case _:
|
case _:
|
||||||
raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
|
raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
|
||||||
|
|
||||||
|
@ -150,7 +150,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
|
|||||||
recipe = Recipe(
|
recipe = Recipe(
|
||||||
name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
|
name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
|
||||||
slug="",
|
slug="",
|
||||||
image=try_get_default(None, "image", None),
|
image=try_get_default(None, "image", None, cleaner.clean_image),
|
||||||
description=try_get_default(None, "description", "", cleaner.clean_string),
|
description=try_get_default(None, "description", "", cleaner.clean_string),
|
||||||
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
|
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
|
||||||
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
|
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
|
||||||
|
@ -73,22 +73,27 @@ image_cleaner_test_cases = (
|
|||||||
CleanerCase(
|
CleanerCase(
|
||||||
test_id="empty_string",
|
test_id="empty_string",
|
||||||
input="",
|
input="",
|
||||||
expected="no image",
|
expected=["no image"],
|
||||||
),
|
),
|
||||||
CleanerCase(
|
CleanerCase(
|
||||||
test_id="no_change",
|
test_id="no_change",
|
||||||
input="https://example.com/image.jpg",
|
input="https://example.com/image.jpg",
|
||||||
expected="https://example.com/image.jpg",
|
expected=["https://example.com/image.jpg"],
|
||||||
),
|
),
|
||||||
CleanerCase(
|
CleanerCase(
|
||||||
test_id="dict with url key",
|
test_id="dict with url key",
|
||||||
input={"url": "https://example.com/image.jpg"},
|
input={"url": "https://example.com/image.jpg"},
|
||||||
expected="https://example.com/image.jpg",
|
expected=["https://example.com/image.jpg"],
|
||||||
),
|
),
|
||||||
CleanerCase(
|
CleanerCase(
|
||||||
test_id="list of strings",
|
test_id="list of strings",
|
||||||
input=["https://example.com/image.jpg"],
|
input=["https://example.com/image.jpg"],
|
||||||
expected="https://example.com/image.jpg",
|
expected=["https://example.com/image.jpg"],
|
||||||
|
),
|
||||||
|
CleanerCase(
|
||||||
|
test_id="list of dicts with url key",
|
||||||
|
input=[{"url": "https://example.com/image.jpg"}],
|
||||||
|
expected=["https://example.com/image.jpg"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user