diff --git a/mealie/services/scraper/recipe_scraper.py b/mealie/services/scraper/recipe_scraper.py
index b8c516372aea..a9faeeccb6fc 100644
--- a/mealie/services/scraper/recipe_scraper.py
+++ b/mealie/services/scraper/recipe_scraper.py
@@ -3,6 +3,8 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from .scraper_strategies import ABCScraperStrategy, RecipeScraperOpenGraph, RecipeScraperPackage
 
+DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [RecipeScraperPackage, RecipeScraperOpenGraph]
+
 
 class RecipeScraper:
     """
@@ -14,10 +16,7 @@ class RecipeScraper:
 
     def __init__(self, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
         if scrapers is None:
-            scrapers = [
-                RecipeScraperPackage,
-                RecipeScraperOpenGraph,
-            ]
+            scrapers = DEFAULT_SCRAPER_STRATEGIES
 
         self.scrapers = scrapers
 
diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py
index c0847fe3389e..5e58472a1a6d 100644
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -81,6 +81,10 @@ class ABCScraperStrategy(ABC):
         self.logger = get_logger()
         self.url = url
 
+    @abstractmethod
+    async def get_html(self, url: str) -> str:
+        ...
+
     @abstractmethod
     async def parse(self) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """Parse a recipe from a web URL.
@@ -95,6 +99,9 @@
 
 
 class RecipeScraperPackage(ABCScraperStrategy):
+    async def get_html(self, url: str) -> str:
+        return await safe_scrape_html(url)
+
     def clean_scraper(self, scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> tuple[Recipe, ScrapedExtras]:
         def try_get_default(func_call: Callable | None, get_attr: str, default: Any, clean_func=None):
             value = default
@@ -160,7 +167,8 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras
 
     async def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
-        recipe_html = await safe_scrape_html(self.url)
+        recipe_html = await self.get_html(self.url)
+
         try:
             scraped_schema = scrape_html(recipe_html, org_url=self.url)
         except (NoSchemaFoundInWildMode, AttributeError):
@@ -204,8 +212,8 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     Abstract class for all recipe parsers.
     """
 
-    async def get_html(self) -> str:
-        return await safe_scrape_html(self.url)
+    async def get_html(self, url: str) -> str:
+        return await safe_scrape_html(url)
 
     def get_recipe_fields(self, html) -> dict | None:
         """
@@ -245,7 +253,7 @@
         """
         Parse a recipe from a given url.
         """
-        html = await self.get_html()
+        html = await self.get_html(self.url)
 
         og_data = self.get_recipe_fields(html)
 
diff --git a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py
index 1689ce73ea57..7244d75cc60b 100644
--- a/tests/integration_tests/user_recipe_tests/test_recipe_crud.py
+++ b/tests/integration_tests/user_recipe_tests/test_recipe_crud.py
@@ -1,6 +1,5 @@
 import json
 from pathlib import Path
-from typing import Optional, Union
 
 import pytest
 from bs4 import BeautifulSoup
@@ -12,7 +11,7 @@
 from mealie.schema.recipe.recipe import RecipeCategory
 from mealie.services.recipe.recipe_data_service import RecipeDataService
-from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
+from mealie.services.scraper.recipe_scraper import DEFAULT_SCRAPER_STRATEGIES
 from tests import data, utils
 from tests.utils import api_routes
 from tests.utils.factories import random_string
 
@@ -31,9 +30,9 @@ def get_init(html_path: Path):
     def init_override(
         self,
         url,
-        proxies: Optional[str] = None,
-        timeout: Optional[Union[float, tuple, None]] = None,
-        wild_mode: Optional[bool] = False,
+        proxies: str | None = None,
+        timeout: float | tuple | None = None,
+        wild_mode: bool | None = False,
         **_,
     ):
         page_data = html_path.read_bytes()
@@ -48,7 +47,7 @@ def get_init(html_path: Path):
 
 
 def open_graph_override(html: str):
-    def get_html(self) -> str:
+    async def get_html(self, url: str) -> str:
         return html
 
     return get_html
@@ -68,11 +67,12 @@ def test_create_by_url(
         get_init(recipe_data.html_file),
     )
-    # Override the get_html method of the RecipeScraperOpenGraph to return the test html
-    monkeypatch.setattr(
-        RecipeScraperOpenGraph,
-        "get_html",
-        open_graph_override(recipe_data.html_file.read_text()),
-    )
+    # Override the get_html method of all scraper strategies to return the test html
+    for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
+        monkeypatch.setattr(
+            scraper_cls,
+            "get_html",
+            open_graph_override(recipe_data.html_file.read_text()),
+        )
     # Skip image downloader
     monkeypatch.setattr(
         RecipeDataService,
@@ -113,12 +113,13 @@ def test_create_by_url_with_tags(
         "__init__",
         get_init(html_file),
     )
-    # Override the get_html method of the RecipeScraperOpenGraph to return the test html
-    monkeypatch.setattr(
-        RecipeScraperOpenGraph,
-        "get_html",
-        open_graph_override(html_file.read_text()),
-    )
+    # Override the get_html method of all scraper strategies to return the test html
+    for scraper_cls in DEFAULT_SCRAPER_STRATEGIES:
+        monkeypatch.setattr(
+            scraper_cls,
+            "get_html",
+            open_graph_override(html_file.read_text()),
+        )
     # Skip image downloader
     monkeypatch.setattr(
         RecipeDataService,
@@ -198,7 +199,7 @@ def test_read_update(
     assert len(recipe["recipeCategory"]) == len(recipe_categories)
 
     test_name = [x.name for x in recipe_categories]
-    for cats in zip(recipe["recipeCategory"], recipe_categories):
+    for cats in zip(recipe["recipeCategory"], recipe_categories, strict=False):
        assert cats[0]["name"] in test_name