mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-06-09 00:25:43 -04:00
perf: use HEAD method to determine best image (#1614)
This commit is contained in:
parent
4a0fb56d18
commit
9ea5e6584f
@ -1,4 +1,6 @@
|
|||||||
|
import asyncio
|
||||||
import shutil
|
import shutil
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -11,6 +13,30 @@ from mealie.services._base_service import BaseService
|
|||||||
_FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
|
_FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
|
||||||
|
|
||||||
|
|
||||||
|
async def largest_content_len(urls: list[str]) -> tuple[str, int]:
    """Find the URL whose HEAD response advertises the largest ``Content-Length``.

    A HEAD request is issued for every URL on a small thread pool so that only
    headers are transferred, never the image bodies themselves.

    Args:
        urls: Candidate URLs to probe.

    Returns:
        A ``(url, length)`` tuple for the largest reported ``Content-Length``.
        The URL is taken from the response object. ``("", 0)`` is returned when
        ``urls`` is empty or no response reports a positive length.

    Raises:
        Any exception raised by an individual HEAD request propagates to the
        caller through ``asyncio.gather``.
    """
    largest_url = ""
    largest_len = 0

    with ThreadPoolExecutor(max_workers=10) as executor:
        with requests.Session() as session:
            loop = asyncio.get_running_loop()

            def head(url: str) -> requests.Response:
                return session.head(url, headers={"User-Agent": _FIREFOX_UA})

            # The URL is handed to the executor as an explicit argument. A
            # zero-argument lambda closing over the comprehension variable is
            # late-bound: by the time a worker thread runs it, ``url`` may
            # already refer to a later element, so several tasks could end up
            # requesting the same URL.
            tasks = [loop.run_in_executor(executor, head, url) for url in urls]

            response: requests.Response  # required for type hinting within the loop
            for response in await asyncio.gather(*tasks):
                # A missing header counts as length 0 and can never win.
                len_int = int(response.headers.get("Content-Length", 0))
                if len_int > largest_len:
                    largest_url = response.url
                    largest_len = len_int

    return largest_url, largest_len
|
||||||
|
|
||||||
|
|
||||||
class NotAnImageError(Exception):
    """Exception type signalling that some content is not an image."""
|
||||||
|
|
||||||
@ -93,19 +119,14 @@ class RecipeDataService(BaseService):
|
|||||||
# Typically would be in smallest->biggest order, but can't be certain so test each.
|
# Typically would be in smallest->biggest order, but can't be certain so test each.
|
||||||
# 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
|
# 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
|
||||||
|
|
||||||
all_image_requests = []
|
# TODO: We should refactor the scraper to use an async session provided by FastAPI using a sync
|
||||||
for url in image_url:
|
# route instead of bootstrapping async behavior this far down the chain. Will require some work
|
||||||
if isinstance(url, dict):
|
# so leaving this improvement here for now.
|
||||||
url = url.get("url", "")
|
loop = asyncio.new_event_loop()
|
||||||
try:
|
asyncio.set_event_loop(loop)
|
||||||
r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
|
future = asyncio.ensure_future(largest_content_len(image_url))
|
||||||
except Exception:
|
loop.run_until_complete(future)
|
||||||
self.logger.exception(f"Image {url} could not be requested")
|
image_url, _ = future.result()
|
||||||
continue
|
|
||||||
if r.status_code == 200:
|
|
||||||
all_image_requests.append((url, r))
|
|
||||||
|
|
||||||
image_url, _ = max(all_image_requests, key=lambda url_r: len(url_r[1].content), default=("", 0))
|
|
||||||
|
|
||||||
elif isinstance(image_url, dict): # Handles Dictionary Types
|
elif isinstance(image_url, dict): # Handles Dictionary Types
|
||||||
for key in image_url:
|
for key in image_url:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user