From f8ad72ec3147320c191b1bb1284f06e345a708b0 Mon Sep 17 00:00:00 2001
From: Michael Genson <71845777+michael-genson@users.noreply.github.com>
Date: Thu, 7 Dec 2023 11:08:47 -0600
Subject: [PATCH] fix: Bulk URL Import Fixes (#2796)
* allow exceptions when fetching content
* removed extra bracket on import text
* added more fault tolerance and limited concurrency (see the sketch below)
* fix entries not being saved to report
* disable clicking into in-progress import
* conditionally render expansion
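
The core pattern behind these fixes is a semaphore-bounded gather that can
optionally swallow failures. A minimal, self-contained sketch of the helper as
it looks after this patch; the demo coroutine fetch() below is only
illustrative and not part of the change:

    import asyncio

    async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
        # at most n coroutines run at once; the rest wait on the semaphore
        semaphore = asyncio.Semaphore(n)

        async def sem_coro(coro):
            async with semaphore:
                return await coro

        # return_exceptions=True turns raised exceptions into result values,
        # so one failed task no longer cancels the whole batch
        results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
        if ignore_exceptions:
            results = [r for r in results if not isinstance(r, Exception)]
        return results

    async def fetch(i: int) -> int:
        if i == 3:
            raise RuntimeError("boom")
        return i

    # asyncio.run(gather_with_concurrency(2, *(fetch(i) for i in range(5)), ignore_exceptions=True))
    # -> [0, 1, 2, 4]  (the failing task is dropped instead of raising)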
---
frontend/components/global/ReportTable.vue | 4 ++
frontend/pages/group/data/recipes.vue | 2 +-
frontend/pages/group/reports/_id.vue | 2 +-
mealie/services/recipe/recipe_data_service.py | 9 +++--
.../services/scraper/recipe_bulk_scraper.py | 37 ++++++++++++-------
mealie/services/scraper/scraper_strategies.py | 2 +-
6 files changed, 37 insertions(+), 19 deletions(-)
diff --git a/frontend/components/global/ReportTable.vue b/frontend/components/global/ReportTable.vue
index a4eaabdf3488..aede7b1c4b71 100644
--- a/frontend/components/global/ReportTable.vue
+++ b/frontend/components/global/ReportTable.vue
@@ -49,6 +49,10 @@ export default defineComponent({
     ];
     function handleRowClick(item: ReportSummary) {
+      if (item.status === "in-progress") {
+        return;
+      }
+
       router.push(`/group/reports/${item.id}`);
     }
diff --git a/frontend/pages/group/data/recipes.vue b/frontend/pages/group/data/recipes.vue
index 0895620bf3a2..0803f285badc 100644
--- a/frontend/pages/group/data/recipes.vue
+++ b/frontend/pages/group/data/recipes.vue
@@ -121,7 +121,7 @@
{{ $globals.icons.database }}
- {{ $t('general.import') }}}
+ {{ $t('general.import') }}
diff --git a/frontend/pages/group/reports/_id.vue b/frontend/pages/group/reports/_id.vue
--- a/frontend/pages/group/reports/_id.vue
+++ b/frontend/pages/group/reports/_id.vue
- {{ item.exception }} |
+ {{ item.exception }} |
diff --git a/mealie/services/recipe/recipe_data_service.py b/mealie/services/recipe/recipe_data_service.py
index 59d42feb9b44..1f5bc759fab7 100644
--- a/mealie/services/recipe/recipe_data_service.py
+++ b/mealie/services/recipe/recipe_data_service.py
@@ -12,14 +12,17 @@ from mealie.services._base_service import BaseService
_FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
-async def gather_with_concurrency(n, *coros):
+async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
     semaphore = asyncio.Semaphore(n)
     async def sem_coro(coro):
         async with semaphore:
             return await coro
-    return await asyncio.gather(*(sem_coro(c) for c in coros))
+    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
+    if ignore_exceptions:
+        results = [r for r in results if not isinstance(r, Exception)]
+    return results
async def largest_content_len(urls: list[str]) -> tuple[str, int]:
@@ -31,7 +34,7 @@ async def largest_content_len(urls: list[str]) -> tuple[str, int]:
     async with AsyncClient() as client:
         tasks = [do(client, url) for url in urls]
-        responses: list[Response] = await gather_with_concurrency(10, *tasks)
+        responses: list[Response] = await gather_with_concurrency(10, *tasks, ignore_exceptions=True)
         for response in responses:
             len_int = int(response.headers.get("Content-Length", 0))
             if len_int > largest_len:
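
For context, a rough sketch of how the new ignore_exceptions flag is consumed
by largest_content_len. It reuses the module-level _FIREFOX_UA string and the
gather_with_concurrency helper shown above; the lines outside this hunk (the
do() helper body, initialization, and return) are approximations and may differ
from the actual file:

    from httpx import AsyncClient, Response

    async def largest_content_len(urls: list[str]) -> tuple[str, int]:
        largest_url, largest_len = "", 0

        async def do(client: AsyncClient, url: str) -> Response:
            # HEAD request just to read Content-Length, using the module's UA string
            return await client.head(url, headers={"User-Agent": _FIREFOX_UA}, follow_redirects=True)

        async with AsyncClient() as client:
            tasks = [do(client, url) for url in urls]
            # failed HEAD requests are filtered out instead of raising
            responses: list[Response] = await gather_with_concurrency(10, *tasks, ignore_exceptions=True)
            for response in responses:
                len_int = int(response.headers.get("Content-Length", 0))
                if len_int > largest_len:
                    largest_url, largest_len = str(response.url), len_int

        return largest_url, largest_len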
diff --git a/mealie/services/scraper/recipe_bulk_scraper.py b/mealie/services/scraper/recipe_bulk_scraper.py
index 947e8b688d64..ed701ecc3886 100644
--- a/mealie/services/scraper/recipe_bulk_scraper.py
+++ b/mealie/services/scraper/recipe_bulk_scraper.py
@@ -1,10 +1,16 @@
-from asyncio import gather
+import asyncio
from pydantic import UUID4
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
-from mealie.schema.reports.reports import ReportCategory, ReportCreate, ReportEntryCreate, ReportSummaryStatus
+from mealie.schema.reports.reports import (
+    ReportCategory,
+    ReportCreate,
+    ReportEntryCreate,
+    ReportEntryOut,
+    ReportSummaryStatus,
+)
from mealie.schema.user.user import GroupInDB
from mealie.services._base_service import BaseService
from mealie.services.recipe.recipe_service import RecipeService
@@ -47,6 +53,7 @@ class RecipeBulkScraperService(BaseService):
         is_success = True
         is_failure = True
+        new_entries: list[ReportEntryOut] = []
         for entry in self.report_entries:
             if is_failure and entry.success:
                 is_failure = False
@@ -54,7 +61,7 @@ class RecipeBulkScraperService(BaseService):
             if is_success and not entry.success:
                 is_success = False
-            self.repos.group_report_entries.create(entry)
+            new_entries.append(self.repos.group_report_entries.create(entry))
         if is_success:
             self.report.status = ReportSummaryStatus.success
@@ -65,25 +72,29 @@ class RecipeBulkScraperService(BaseService):
         if not is_success and not is_failure:
             self.report.status = ReportSummaryStatus.partial
+        self.report.entries = new_entries
         self.repos.group_reports.update(self.report.id, self.report)
     async def scrape(self, urls: CreateRecipeByUrlBulk) -> None:
+        sem = asyncio.Semaphore(3)
+
         async def _do(url: str) -> Recipe | None:
-            try:
-                recipe, _ = await create_from_url(url)
-                return recipe
-            except Exception as e:
-                self.service.logger.error(f"failed to scrape url during bulk url import {b.url}")
-                self.service.logger.exception(e)
-                self._add_error_entry(f"failed to scrape url {url}", str(e))
-                return None
+            async with sem:
+                try:
+                    recipe, _ = await create_from_url(url)
+                    return recipe
+                except Exception as e:
+                    self.service.logger.error(f"failed to scrape url during bulk url import {url}")
+                    self.service.logger.exception(e)
+                    self._add_error_entry(f"failed to scrape url {url}", str(e))
+                    return None
         if self.report is None:
             self.get_report_id()
         tasks = [_do(b.url) for b in urls.imports]
-        results = await gather(*tasks)
+        results = await asyncio.gather(*tasks, return_exceptions=True)
         for b, recipe in zip(urls.imports, results, strict=True):
-            if not recipe:
+            if not recipe or isinstance(recipe, Exception):
                 continue
             if b.tags:
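
The scrape() changes combine three things: a shared semaphore so only a few
scrapes run concurrently, gather(..., return_exceptions=True) so a crash in one
task cannot take down the batch, and a strict zip so results stay aligned with
the submitted imports. A self-contained toy version of that flow; fake_scrape
stands in for create_from_url and is purely illustrative:

    import asyncio

    async def fake_scrape(url: str) -> str:
        if "bad" in url:
            raise ValueError(f"cannot scrape {url}")
        return f"recipe from {url}"

    async def scrape_all(urls: list[str]) -> list[str]:
        sem = asyncio.Semaphore(3)  # at most three scrapes in flight, as in the patch

        async def _do(url: str) -> str | None:
            async with sem:
                try:
                    return await fake_scrape(url)
                except Exception:
                    return None  # the real code also logs and records a report entry here

        results = await asyncio.gather(*(_do(u) for u in urls), return_exceptions=True)
        # strict zip keeps results aligned with the inputs; failures are skipped
        return [r for _url, r in zip(urls, results, strict=True) if r and not isinstance(r, Exception)]

    # asyncio.run(scrape_all(["https://a.example", "https://bad.example", "https://c.example"]))
    # -> ['recipe from https://a.example', 'recipe from https://c.example']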
diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py
index cd2435192366..fd51498023a4 100644
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -172,7 +172,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
         try:
             scraped_schema = scrape_html(recipe_html, org_url=self.url)
         except (NoSchemaFoundInWildMode, AttributeError):
-            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
             return None
         except ConnectionError as e: