fix: recipe scraper image cleaning (#2139)

* updated image cleaner; enabled image cleaner; added case for nested image dicts
* refactored image cleaner to return a list of urls
2025-07-09 03:04:54 -04:00 · 2023-02-19 18:43:52 -06:00 · 2023-02-19 18:43:52 -06:00 · 05e2566c35
commit 05e2566c35
parent 53fe5921d2
3 changed files with 23 additions and 15 deletions
--- a/mealie/services/scraper/cleaner.py
+++ b/mealie/services/scraper/cleaner.py
@@ -47,7 +47,7 @@ def clean(recipe_data: dict, url=None) -> dict:
    recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
    recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
    recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
-    recipe_data["image"] = clean_image(recipe_data.get("image"))
+    recipe_data["image"] = clean_image(recipe_data.get("image"))[0]
    recipe_data["slug"] = slugify(recipe_data.get("name", ""))
    recipe_data["orgURL"] = url

@@ -77,31 +77,34 @@ def clean_string(text: str | list | int) -> str:
    return cleaned_text


-def clean_image(image: str | list | dict | None = None, default="no image") -> str:
+def clean_image(image: str | list | dict | None = None, default: str = "no image") -> list[str]:
    """
    image attempts to parse the image field from a recipe and return a string. Currenty

    Supported Structures:
-        - `["https://exmaple.com"]` - A list of strings
        - `https://exmaple.com` - A string
-        - `{ "url": "https://exmaple.com"` - A dictionary with a `url` key
+        - `{ "url": "https://exmaple.com" }` - A dictionary with a `url` key
+        - `["https://exmaple.com"]` - A list of strings
+        - `[{ "url": "https://exmaple.com" }]` - A list of dictionaries with a `url` key

    Raises:
        TypeError: If the image field is not a supported type a TypeError is raised.

    Returns:
-        str: "no image" if any empty string is provided or the url of the image
+        list[str]: list of urls, or [default] if input is empty
    """
    if not image:
-        return default
+        return [default]

-    match image:  # noqa - match statement not supported
+    match image:
        case str(image):
+            return [image]
+        case [str(_), *_]:
            return image
-        case list(image):
-            return image[0]
+        case [{"url": str(_)}, *_]:
+            return [x["url"] for x in image]
        case {"url": str(image)}:
-            return image
+            return [image]
        case _:
            raise TypeError(f"Unexpected type for image: {type(image)}, {image}")

--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -150,7 +150,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
        recipe = Recipe(
            name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
            slug="",
-            image=try_get_default(None, "image", None),
+            image=try_get_default(None, "image", None, cleaner.clean_image),
            description=try_get_default(None, "description", "", cleaner.clean_string),
            nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
            recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
--- a/tests/unit_tests/services_tests/scraper_tests/test_cleaner_parts.py
+++ b/tests/unit_tests/services_tests/scraper_tests/test_cleaner_parts.py
@@ -73,22 +73,27 @@ image_cleaner_test_cases = (
    CleanerCase(
        test_id="empty_string",
        input="",
-        expected="no image",
+        expected=["no image"],
    ),
    CleanerCase(
        test_id="no_change",
        input="https://example.com/image.jpg",
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
    ),
    CleanerCase(
        test_id="dict with url key",
        input={"url": "https://example.com/image.jpg"},
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
    ),
    CleanerCase(
        test_id="list of strings",
        input=["https://example.com/image.jpg"],
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
+    ),
+    CleanerCase(
+        test_id="list of dicts with url key",
+        input=[{"url": "https://example.com/image.jpg"}],
+        expected=["https://example.com/image.jpg"],
    ),
 )