From f8ad72ec3147320c191b1bb1284f06e345a708b0 Mon Sep 17 00:00:00 2001
From: Michael Genson <71845777+michael-genson@users.noreply.github.com>
Date: Thu, 7 Dec 2023 11:08:47 -0600
Subject: [PATCH] fix: Bulk URL Import Fixes (#2796)

* allow exceptions when fetching content

* removed extra bracket on import text

* added more fault tolerance and limited concurrency

* fix entries not being saved to report

* disable clicking into in-progress import

* conditionally render expansion
---
 frontend/components/global/ReportTable.vue    |  4 ++
 frontend/pages/group/data/recipes.vue         |  2 +-
 frontend/pages/group/reports/_id.vue          |  2 +-
 mealie/services/recipe/recipe_data_service.py |  9 +++--
 .../services/scraper/recipe_bulk_scraper.py   | 37 ++++++++++++-------
 mealie/services/scraper/scraper_strategies.py |  2 +-
 6 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/frontend/components/global/ReportTable.vue b/frontend/components/global/ReportTable.vue
index a4eaabdf3488..aede7b1c4b71 100644
--- a/frontend/components/global/ReportTable.vue
+++ b/frontend/components/global/ReportTable.vue
@@ -49,6 +49,10 @@ export default defineComponent({
     ];
 
     function handleRowClick(item: ReportSummary) {
+      if (item.status === "in-progress") {
+        return;
+      }
+
       router.push(`/group/reports/${item.id}`);
     }
 
diff --git a/frontend/pages/group/data/recipes.vue b/frontend/pages/group/data/recipes.vue
index 0895620bf3a2..0803f285badc 100644
--- a/frontend/pages/group/data/recipes.vue
+++ b/frontend/pages/group/data/recipes.vue
@@ -121,7 +121,7 @@
-          {{ $t('general.import') }}}
+          {{ $t('general.import') }}
diff --git a/mealie/services/recipe/recipe_data_service.py b/mealie/services/recipe/recipe_data_service.py
index 59d42feb9b44..1f5bc759fab7 100644
--- a/mealie/services/recipe/recipe_data_service.py
+++ b/mealie/services/recipe/recipe_data_service.py
@@ -12,14 +12,17 @@ from mealie.services._base_service import BaseService
 
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
 
 
-async def gather_with_concurrency(n, *coros):
+async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
     semaphore = asyncio.Semaphore(n)
 
     async def sem_coro(coro):
         async with semaphore:
             return await coro
 
-    return await asyncio.gather(*(sem_coro(c) for c in coros))
+    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
+    if ignore_exceptions:
+        results = [r for r in results if not isinstance(r, Exception)]
+    return results
 
 
 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
@@ -31,7 +34,7 @@
 
     async with AsyncClient() as client:
         tasks = [do(client, url) for url in urls]
-        responses: list[Response] = await gather_with_concurrency(10, *tasks)
+        responses: list[Response] = await gather_with_concurrency(10, *tasks, ignore_exceptions=True)
         for response in responses:
             len_int = int(response.headers.get("Content-Length", 0))
             if len_int > largest_len:
diff --git a/mealie/services/scraper/recipe_bulk_scraper.py b/mealie/services/scraper/recipe_bulk_scraper.py
index 947e8b688d64..ed701ecc3886 100644
--- a/mealie/services/scraper/recipe_bulk_scraper.py
+++ b/mealie/services/scraper/recipe_bulk_scraper.py
@@ -1,10 +1,16 @@
-from asyncio import gather
+import asyncio
 
 from pydantic import UUID4
 
 from mealie.repos.repository_factory import AllRepositories
 from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
-from mealie.schema.reports.reports import ReportCategory, ReportCreate, ReportEntryCreate, ReportSummaryStatus
+from mealie.schema.reports.reports import (
+    ReportCategory,
+    ReportCreate,
+    ReportEntryCreate,
+    ReportEntryOut,
+    ReportSummaryStatus,
+)
 from mealie.schema.user.user import GroupInDB
 from mealie.services._base_service import BaseService
 from mealie.services.recipe.recipe_service import RecipeService
@@ -47,6 +53,7 @@ class RecipeBulkScraperService(BaseService):
         is_success = True
         is_failure = True
 
+        new_entries: list[ReportEntryOut] = []
         for entry in self.report_entries:
             if is_failure and entry.success:
                 is_failure = False
@@ -54,7 +61,7 @@
             if is_success and not entry.success:
                 is_success = False
 
-            self.repos.group_report_entries.create(entry)
+            new_entries.append(self.repos.group_report_entries.create(entry))
 
         if is_success:
             self.report.status = ReportSummaryStatus.success
@@ -65,25 +72,29 @@
         if not is_success and not is_failure:
             self.report.status = ReportSummaryStatus.partial
 
+        self.report.entries = new_entries
         self.repos.group_reports.update(self.report.id, self.report)
 
     async def scrape(self, urls: CreateRecipeByUrlBulk) -> None:
+        sem = asyncio.Semaphore(3)
+
         async def _do(url: str) -> Recipe | None:
-            try:
-                recipe, _ = await create_from_url(url)
-                return recipe
-            except Exception as e:
-                self.service.logger.error(f"failed to scrape url during bulk url import {b.url}")
-                self.service.logger.exception(e)
-                self._add_error_entry(f"failed to scrape url {url}", str(e))
-                return None
+            async with sem:
+                try:
+                    recipe, _ = await create_from_url(url)
+                    return recipe
+                except Exception as e:
+                    self.service.logger.error(f"failed to scrape url during bulk url import {url}")
+                    self.service.logger.exception(e)
+                    self._add_error_entry(f"failed to scrape url {url}", str(e))
+                    return None
 
         if self.report is None:
             self.get_report_id()
 
         tasks = [_do(b.url) for b in urls.imports]
-        results = await gather(*tasks)
+        results = await asyncio.gather(*tasks, return_exceptions=True)
 
         for b, recipe in zip(urls.imports, results, strict=True):
-            if not recipe:
+            if not recipe or isinstance(recipe, Exception):
                 continue
 
             if b.tags:
diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py
index cd2435192366..fd51498023a4 100644
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -172,7 +172,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
         try:
             scraped_schema = scrape_html(recipe_html, org_url=self.url)
         except (NoSchemaFoundInWildMode, AttributeError):
-            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
             return None
 
         except ConnectionError as e:
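
Note: the gather_with_concurrency helper changed above caps the number of in-flight coroutines with a shared asyncio.Semaphore and, when ignore_exceptions=True, drops failed results instead of letting one bad URL fail the whole asyncio.gather call. Below is a minimal, self-contained sketch of that pattern; the fetch coroutine and its simulated failure are illustrative only and not part of this patch.

import asyncio


async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
    # At most `n` coroutines hold the semaphore (i.e. run) at any one time.
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        # Drop failed coroutines instead of propagating their exceptions.
        results = [r for r in results if not isinstance(r, Exception)]
    return results


async def fetch(i: int) -> int:
    # Illustrative stand-in for a scrape or HEAD request.
    if i == 3:
        raise ValueError("simulated failure")
    await asyncio.sleep(0.01)
    return i


async def main() -> None:
    # One failing task no longer aborts the batch; its result is simply omitted.
    ok = await gather_with_concurrency(2, *(fetch(i) for i in range(6)), ignore_exceptions=True)
    print(ok)  # [0, 1, 2, 4, 5]


if __name__ == "__main__":
    asyncio.run(main())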