fix: recipe scraper image cleaning (#2139)

* updated image cleaner
enabled image cleaner
added case for nested image dicts

* refactored image cleaner to return a list of urls
This commit is contained in:
Michael Genson 2023-02-19 18:43:52 -06:00 committed by GitHub
parent 53fe5921d2
commit 05e2566c35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 15 deletions

View File

@ -47,7 +47,7 @@ def clean(recipe_data: dict, url=None) -> dict:
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
recipe_data["image"] = clean_image(recipe_data.get("image"))
recipe_data["image"] = clean_image(recipe_data.get("image"))[0]
recipe_data["slug"] = slugify(recipe_data.get("name", ""))
recipe_data["orgURL"] = url
@ -77,31 +77,34 @@ def clean_string(text: str | list | int) -> str:
return cleaned_text
def clean_image(image: str | list | dict | None = None, default="no image") -> str:
def clean_image(image: str | list | dict | None = None, default: str = "no image") -> list[str]:
"""
image attempts to parse the image field from a recipe and return a list of image URLs. Currently
Supported Structures:
- `["https://example.com"]` - A list of strings
- `https://example.com` - A string
- `{ "url": "https://example.com"` - A dictionary with a `url` key
- `{ "url": "https://example.com" }` - A dictionary with a `url` key
- `["https://example.com"]` - A list of strings
- `[{ "url": "https://example.com" }]` - A list of dictionaries with a `url` key
Raises:
TypeError: If the image field is not a supported type a TypeError is raised.
Returns:
str: "no image" if any empty string is provided or the url of the image
list[str]: list of urls, or [default] if input is empty
"""
if not image:
return default
return [default]
match image: # noqa - match statement not supported
match image:
case str(image):
return [image]
case [str(_), *_]:
return image
case list(image):
return image[0]
case [{"url": str(_)}, *_]:
return [x["url"] for x in image]
case {"url": str(image)}:
return image
return [image]
case _:
raise TypeError(f"Unexpected type for image: {type(image)}, {image}")

View File

@ -150,7 +150,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
recipe = Recipe(
name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
slug="",
image=try_get_default(None, "image", None),
image=try_get_default(None, "image", None, cleaner.clean_image),
description=try_get_default(None, "description", "", cleaner.clean_string),
nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),

View File

@ -73,22 +73,27 @@ image_cleaner_test_cases = (
CleanerCase(
test_id="empty_string",
input="",
expected="no image",
expected=["no image"],
),
CleanerCase(
test_id="no_change",
input="https://example.com/image.jpg",
expected="https://example.com/image.jpg",
expected=["https://example.com/image.jpg"],
),
CleanerCase(
test_id="dict with url key",
input={"url": "https://example.com/image.jpg"},
expected="https://example.com/image.jpg",
expected=["https://example.com/image.jpg"],
),
CleanerCase(
test_id="list of strings",
input=["https://example.com/image.jpg"],
expected="https://example.com/image.jpg",
expected=["https://example.com/image.jpg"],
),
CleanerCase(
test_id="list of dicts with url key",
input=[{"url": "https://example.com/image.jpg"}],
expected=["https://example.com/image.jpg"],
),
)