fix: recipe scraper image cleaning (#2139)

* updated image cleaner
enabled image cleaner
added case for nested image dicts

* refactored image cleaner to return a list of urls
Michael Genson 2023-02-19 18:43:52 -06:00 committed by GitHub
parent 53fe5921d2
commit 05e2566c35
3 changed files with 23 additions and 15 deletions

@@ -47,7 +47,7 @@ def clean(recipe_data: dict, url=None) -> dict:
     recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
     recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
     recipe_data["recipeInstructions"] = clean_instructions(recipe_data.get("recipeInstructions", []))
-    recipe_data["image"] = clean_image(recipe_data.get("image"))
+    recipe_data["image"] = clean_image(recipe_data.get("image"))[0]
     recipe_data["slug"] = slugify(recipe_data.get("name", ""))
     recipe_data["orgURL"] = url
@@ -77,31 +77,34 @@ def clean_string(text: str | list | int) -> str:
     return cleaned_text
 
 
-def clean_image(image: str | list | dict | None = None, default="no image") -> str:
+def clean_image(image: str | list | dict | None = None, default: str = "no image") -> list[str]:
     """
     image attempts to parse the image field from a recipe and return a string. Currenty
 
     Supported Structures:
-        - `["https://exmaple.com"]` - A list of strings
         - `https://exmaple.com` - A string
-        - `{ "url": "https://exmaple.com"` - A dictionary with a `url` key
+        - `{ "url": "https://exmaple.com" }` - A dictionary with a `url` key
+        - `["https://exmaple.com"]` - A list of strings
+        - `[{ "url": "https://exmaple.com" }]` - A list of dictionaries with a `url` key
 
     Raises:
         TypeError: If the image field is not a supported type a TypeError is raised.
 
     Returns:
-        str: "no image" if any empty string is provided or the url of the image
+        list[str]: list of urls, or [default] if input is empty
     """
     if not image:
-        return default
+        return [default]
 
-    match image:  # noqa - match statement not supported
+    match image:
         case str(image):
+            return [image]
+        case [str(_), *_]:
             return image
-        case list(image):
-            return image[0]
+        case [{"url": str(_)}, *_]:
+            return [x["url"] for x in image]
         case {"url": str(image)}:
-            return image
+            return [image]
         case _:
             raise TypeError(f"Unexpected type for image: {type(image)}, {image}")
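
For reference, the refactored clean_image behaves as follows per the match arms above (made-up URLs):

    clean_image("https://example.com/pie.jpg")              # -> ["https://example.com/pie.jpg"]
    clean_image(["https://example.com/a.jpg", "https://example.com/b.jpg"])  # -> list returned as-is
    clean_image({"url": "https://example.com/pie.jpg"})     # -> ["https://example.com/pie.jpg"]
    clean_image([{"url": "https://example.com/pie.jpg"}])   # -> ["https://example.com/pie.jpg"]  (new nested-dict case)
    clean_image(None)                                        # -> ["no image"]
    clean_image(42)                                          # raises TypeError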

@@ -150,7 +150,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
         recipe = Recipe(
             name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
             slug="",
-            image=try_get_default(None, "image", None),
+            image=try_get_default(None, "image", None, cleaner.clean_image),
            description=try_get_default(None, "description", "", cleaner.clean_string),
             nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
             recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
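
try_get_default itself is not touched by this commit; the only change above is that cleaner.clean_image is now passed as the cleaning callable for the scraped image value. A hypothetical stand-in for that final cleaning step (not the real helper) to show the effect:

    def _apply_cleaner(value, clean_func=None):
        # hypothetical illustration: try_get_default presumably runs the value through the cleaner when one is given
        return clean_func(value) if clean_func else value

    _apply_cleaner({"url": "https://example.com/image.jpg"}, cleaner.clean_image)
    # -> ["https://example.com/image.jpg"]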

@@ -73,22 +73,27 @@ image_cleaner_test_cases = (
     CleanerCase(
         test_id="empty_string",
         input="",
-        expected="no image",
+        expected=["no image"],
     ),
     CleanerCase(
         test_id="no_change",
         input="https://example.com/image.jpg",
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
     ),
     CleanerCase(
         test_id="dict with url key",
         input={"url": "https://example.com/image.jpg"},
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
     ),
     CleanerCase(
         test_id="list of strings",
         input=["https://example.com/image.jpg"],
-        expected="https://example.com/image.jpg",
+        expected=["https://example.com/image.jpg"],
+    ),
+    CleanerCase(
+        test_id="list of dicts with url key",
+        input=[{"url": "https://example.com/image.jpg"}],
+        expected=["https://example.com/image.jpg"],
     ),
 )
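
The added case covers the new nested-dict arm of clean_image. These CleanerCase entries are presumably fed to a parametrized test elsewhere in the module, roughly like this (assumed harness; the test function name is illustrative):

    @pytest.mark.parametrize("case", image_cleaner_test_cases, ids=[c.test_id for c in image_cleaner_test_cases])
    def test_clean_image(case):
        assert cleaner.clean_image(case.input) == case.expected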