fix: Bulk URL Import Fixes (#2796)

* allow exceptions when fetching content

* removed extra bracket on import text

* added more fault tolerance and limited concurrency

* fix entries not being saved to report

* disable clicking into in-progress import

* conditionally render expansion
Michael Genson 2023-12-07 11:08:47 -06:00 committed by GitHub
parent 449bb6f0ce
commit f8ad72ec31
6 changed files with 37 additions and 19 deletions

View File

@@ -49,6 +49,10 @@ export default defineComponent({
     ];

     function handleRowClick(item: ReportSummary) {
+      if (item.status === "in-progress") {
+        return;
+      }
+
       router.push(`/group/reports/${item.id}`);
     }

View File

@@ -121,7 +121,7 @@
             <template #icon>
               {{ $globals.icons.database }}
             </template>
-            {{ $t('general.import') }}}
+            {{ $t('general.import') }}
           </BaseButton>
           <BaseButton
             color="info"

View File

@@ -21,7 +21,7 @@
           {{ $d(Date.parse(item.timestamp), "short") }}
         </template>
         <template #expanded-item="{ headers, item }">
-          <td class="pa-6" :colspan="headers.length">{{ item.exception }}</td>
+          <td v-if="item.exception" class="pa-6" :colspan="headers.length">{{ item.exception }}</td>
         </template>
       </v-data-table>
     </v-container>

View File

@@ -12,14 +12,17 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"


-async def gather_with_concurrency(n, *coros):
+async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
     semaphore = asyncio.Semaphore(n)

     async def sem_coro(coro):
         async with semaphore:
             return await coro

-    return await asyncio.gather(*(sem_coro(c) for c in coros))
+    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
+    if ignore_exceptions:
+        results = [r for r in results if not isinstance(r, Exception)]
+    return results


 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
@@ -31,7 +34,7 @@ async def largest_content_len(urls: list[str]) -> tuple[str, int]:

     async with AsyncClient() as client:
         tasks = [do(client, url) for url in urls]
-        responses: list[Response] = await gather_with_concurrency(10, *tasks)
+        responses: list[Response] = await gather_with_concurrency(10, *tasks, ignore_exceptions=True)
         for response in responses:
             len_int = int(response.headers.get("Content-Length", 0))
             if len_int > largest_len:
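
A minimal, self-contained sketch of how the updated gather_with_concurrency helper is expected to behave with ignore_exceptions=True; fetch and main are illustrative names, not part of Mealie's code:

import asyncio

async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
    # Bound how many coroutines run at once with a semaphore.
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    # return_exceptions=True turns raised exceptions into result values
    # instead of cancelling the whole gather.
    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        # Drop failures so callers only see successful results.
        results = [r for r in results if not isinstance(r, Exception)]
    return results

async def fetch(i: int) -> int:
    # Hypothetical task used only to demonstrate the behavior.
    if i == 2:
        raise ValueError("boom")
    return i

async def main() -> None:
    # The failing coroutine is dropped, so this prints [0, 1, 3].
    print(await gather_with_concurrency(2, *(fetch(i) for i in range(4)), ignore_exceptions=True))

asyncio.run(main())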

View File

@@ -1,10 +1,16 @@
-from asyncio import gather
+import asyncio

 from pydantic import UUID4

 from mealie.repos.repository_factory import AllRepositories
 from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
-from mealie.schema.reports.reports import ReportCategory, ReportCreate, ReportEntryCreate, ReportSummaryStatus
+from mealie.schema.reports.reports import (
+    ReportCategory,
+    ReportCreate,
+    ReportEntryCreate,
+    ReportEntryOut,
+    ReportSummaryStatus,
+)
 from mealie.schema.user.user import GroupInDB
 from mealie.services._base_service import BaseService
 from mealie.services.recipe.recipe_service import RecipeService
@@ -47,6 +53,7 @@ class RecipeBulkScraperService(BaseService):
         is_success = True
         is_failure = True

+        new_entries: list[ReportEntryOut] = []
         for entry in self.report_entries:
             if is_failure and entry.success:
                 is_failure = False
@@ -54,7 +61,7 @@
             if is_success and not entry.success:
                 is_success = False

-            self.repos.group_report_entries.create(entry)
+            new_entries.append(self.repos.group_report_entries.create(entry))

         if is_success:
             self.report.status = ReportSummaryStatus.success
@@ -65,15 +72,19 @@
         if not is_success and not is_failure:
             self.report.status = ReportSummaryStatus.partial

+        self.report.entries = new_entries
         self.repos.group_reports.update(self.report.id, self.report)

     async def scrape(self, urls: CreateRecipeByUrlBulk) -> None:
+        sem = asyncio.Semaphore(3)
+
         async def _do(url: str) -> Recipe | None:
+            async with sem:
                 try:
                     recipe, _ = await create_from_url(url)
                     return recipe
                 except Exception as e:
-                    self.service.logger.error(f"failed to scrape url during bulk url import {b.url}")
+                    self.service.logger.error(f"failed to scrape url during bulk url import {url}")
                     self.service.logger.exception(e)
                     self._add_error_entry(f"failed to scrape url {url}", str(e))
                     return None
@@ -81,9 +92,9 @@
         if self.report is None:
             self.get_report_id()
         tasks = [_do(b.url) for b in urls.imports]
-        results = await gather(*tasks)
+        results = await asyncio.gather(*tasks, return_exceptions=True)
         for b, recipe in zip(urls.imports, results, strict=True):
-            if not recipe:
+            if not recipe or isinstance(recipe, Exception):
                 continue

             if b.tags:
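
The same fault-tolerance pattern, reduced to a standalone sketch: a semaphore caps concurrent scrapes, each failure is converted to None, and gather(..., return_exceptions=True) keeps one bad URL from cancelling the batch. scrape_one and bulk_scrape are illustrative stand-ins, not Mealie's create_from_url service code:

import asyncio

async def scrape_one(sem: asyncio.Semaphore, url: str) -> str | None:
    # The semaphore limits how many scrapes run at once; any failure is
    # logged and returned as None instead of failing the whole import.
    async with sem:
        try:
            if "bad" in url:
                raise RuntimeError("no recipe found")  # simulated scrape failure
            return f"recipe from {url}"
        except Exception as e:
            print(f"failed to scrape url during bulk url import {url}: {e}")
            return None

async def bulk_scrape(urls: list[str]) -> list[str]:
    sem = asyncio.Semaphore(3)
    tasks = [scrape_one(sem, url) for url in urls]
    # return_exceptions=True collects unexpected errors as values rather than raising.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    # Skip anything that failed or raised, mirroring the `continue` in the diff above.
    return [r for r in results if r and not isinstance(r, Exception)]

print(asyncio.run(bulk_scrape(["https://example.com/ok", "https://example.com/bad"])))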

View File

@@ -172,7 +172,7 @@ class RecipeScraperPackage(ABCScraperStrategy):
         try:
             scraped_schema = scrape_html(recipe_html, org_url=self.url)
         except (NoSchemaFoundInWildMode, AttributeError):
-            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            self.logger.error(f"Recipe Scraper was unable to extract a recipe from {self.url}")
             return None
         except ConnectionError as e: