mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-07-09 03:04:54 -04:00
Fix the recipe scraper debugger (#736)
* Fix recipe debugger
* Remove scrape-schema-recipe from dependencies
* Fix breaking tests
This commit is contained in:
parent
75113cc2c7
commit
3831eef508
@ -16,8 +16,7 @@ from mealie.schema.user import UserInDB
|
||||
from mealie.services.events import create_recipe_event
|
||||
from mealie.services.image.image import scrape_image, write_image
|
||||
from mealie.services.recipe.media import check_assets, delete_assets
|
||||
from mealie.services.scraper.scraper import create_from_url
|
||||
from scrape_schema_recipe import scrape_url
|
||||
from mealie.services.scraper.scraper import create_from_url, scrape_from_url
|
||||
from slugify import slugify
|
||||
from sqlalchemy.orm.session import Session
|
||||
from starlette.responses import FileResponse
|
||||
@ -50,7 +49,11 @@ def create_from_json(
|
||||
|
||||
@user_router.post("/test-scrape-url")
|
||||
def test_parse_recipe_url(url: RecipeURLIn):
|
||||
return scrape_url(url.url)
|
||||
# Debugger should produce the same result as the scraper sees before cleaning
|
||||
scraped_data = scrape_from_url(url.url)
|
||||
if scraped_data:
|
||||
return scraped_data.schema.data
|
||||
return "recipe_scrapers was unable to scrape this URL"
|
||||
|
||||
|
||||
@user_router.post("/create-url", status_code=201, response_model=str)
|
||||
|
@ -1,6 +1,6 @@
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Any, Callable
|
||||
from typing import Any, Callable, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
import requests
|
||||
@ -29,7 +29,14 @@ def create_from_url(url: str) -> Recipe:
|
||||
Returns:
|
||||
Recipe: Recipe Object
|
||||
"""
|
||||
new_recipe = scrape_from_url(url)
|
||||
# Try the different scrapers in order.
|
||||
if scraped_data := scrape_from_url(url):
|
||||
new_recipe = clean_scraper(scraped_data, url)
|
||||
elif og_dict := extract_open_graph_values(url):
|
||||
new_recipe = Recipe(**og_dict)
|
||||
else:
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
|
||||
|
||||
logger.info(f"Image {new_recipe.image}")
|
||||
new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
|
||||
|
||||
@ -46,16 +53,17 @@ class ParserErrors(str, Enum):
|
||||
CONNECTION_ERROR = "CONNECTION_ERROR"
|
||||
|
||||
|
||||
def extract_open_graph_values(url) -> Recipe:
|
||||
def extract_open_graph_values(url) -> Optional[dict]:
|
||||
r = requests.get(url)
|
||||
recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
|
||||
|
||||
return Recipe(**recipe)
|
||||
if recipe.get("name", "") == "":
|
||||
return None
|
||||
return recipe
|
||||
|
||||
|
||||
def scrape_from_url(url: str) -> Recipe:
|
||||
"""Entry function for generating a recipe object from a url
|
||||
This will determine if a url can be parsed and raise an appropriate error keyword
|
||||
def scrape_from_url(url: str):
|
||||
"""Entry function to scrape a recipe from a url
|
||||
This will determine if a url can be parsed and return None if not, to allow another parser to try.
|
||||
This keyword is used on the frontend to reference a localized string to present on the UI.
|
||||
|
||||
Args:
|
||||
@ -65,7 +73,7 @@ def scrape_from_url(url: str) -> Recipe:
|
||||
HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
|
||||
|
||||
Returns:
|
||||
Recipe: Recipe Model
|
||||
Optional[Scraped schema for cleaning]
|
||||
"""
|
||||
try:
|
||||
scraped_schema = scrape_me(url)
|
||||
@ -73,28 +81,26 @@ def scrape_from_url(url: str) -> Recipe:
|
||||
try:
|
||||
scraped_schema = scrape_me(url, wild_mode=True)
|
||||
except (NoSchemaFoundInWildMode, AttributeError):
|
||||
recipe = extract_open_graph_values(url)
|
||||
if recipe.name != "":
|
||||
return recipe
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
|
||||
# Recipe_scraper was unable to extract a recipe.
|
||||
return None
|
||||
|
||||
except ConnectionError:
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value})
|
||||
|
||||
# Check to see if the recipe is valid
|
||||
try:
|
||||
ingredients = scraped_schema.ingredients()
|
||||
instruct = scraped_schema.instructions()
|
||||
except Exception:
|
||||
ingredients = []
|
||||
instruct = []
|
||||
|
||||
try:
|
||||
ing = scraped_schema.ingredients()
|
||||
except Exception:
|
||||
ing = []
|
||||
if instruct and ingredients:
|
||||
return scraped_schema
|
||||
|
||||
if not instruct and not ing:
|
||||
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value})
|
||||
else:
|
||||
return clean_scraper(scraped_schema, url)
|
||||
# recipe_scrapers did not get a valid recipe.
|
||||
# Return None to let another scraper try.
|
||||
return None
|
||||
|
||||
|
||||
def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
|
||||
|
60
poetry.lock
generated
60
poetry.lock
generated
@ -229,14 +229,6 @@ sdist = ["setuptools-rust (>=0.11.4)"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["pytest (>=6.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "decorator"
|
||||
version = "5.1.0"
|
||||
description = "Decorators for Humans"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
|
||||
[[package]]
|
||||
name = "ecdsa"
|
||||
version = "0.17.0"
|
||||
@ -329,7 +321,7 @@ python-versions = "*"
|
||||
python-dateutil = ">=2.8.1"
|
||||
|
||||
[package.extras]
|
||||
dev = ["twine", "markdown", "flake8"]
|
||||
dev = ["twine", "markdown", "flake8", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "greenlet"
|
||||
@ -1045,20 +1037,6 @@ python-versions = ">=3.5, <4"
|
||||
[package.dependencies]
|
||||
pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "scrape-schema-recipe"
|
||||
version = "0.1.5"
|
||||
description = "Extracts cooking recipe from HTML structured data in the https://schema.org/Recipe format."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[package.dependencies]
|
||||
extruct = "*"
|
||||
isodate = ">=0.5.1"
|
||||
requests = "*"
|
||||
validators = ">=0.12.4"
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
version = "1.16.0"
|
||||
@ -1208,21 +1186,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=4.1.2,<4.2.0
|
||||
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"]
|
||||
test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"]
|
||||
|
||||
[[package]]
|
||||
name = "validators"
|
||||
version = "0.18.2"
|
||||
description = "Python Data Validation for Humans™."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.4"
|
||||
|
||||
[package.dependencies]
|
||||
decorator = ">=3.4.0"
|
||||
six = ">=1.4.0"
|
||||
|
||||
[package.extras]
|
||||
test = ["pytest (>=2.2.3)", "flake8 (>=2.4.0)", "isort (>=4.2.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "w3lib"
|
||||
version = "1.22.0"
|
||||
@ -1295,7 +1258,7 @@ pgsql = ["psycopg2-binary"]
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "0cafb5f1d3d4ca7b1f27acc107bb1b2bc07d87b3f09e589351b8963ce7fed006"
|
||||
content-hash = "2e7a7366808044d28f47c02bab618d21a3c92bb028e30b9f5ca5232b388f2696"
|
||||
|
||||
[metadata.files]
|
||||
aiofiles = [
|
||||
@ -1470,6 +1433,7 @@ cryptography = [
|
||||
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-win32.whl", hash = "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7"},
|
||||
{file = "cryptography-3.4.8-cp36-abi3-win_amd64.whl", hash = "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc"},
|
||||
@ -1483,10 +1447,6 @@ cryptography = [
|
||||
{file = "cryptography-3.4.8-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e"},
|
||||
{file = "cryptography-3.4.8.tar.gz", hash = "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c"},
|
||||
]
|
||||
decorator = [
|
||||
{file = "decorator-5.1.0-py3-none-any.whl", hash = "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374"},
|
||||
{file = "decorator-5.1.0.tar.gz", hash = "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"},
|
||||
]
|
||||
ecdsa = [
|
||||
{file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"},
|
||||
{file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"},
|
||||
@ -1508,6 +1468,7 @@ flake8 = [
|
||||
]
|
||||
ghp-import = [
|
||||
{file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"},
|
||||
{file = "ghp_import-2.0.1-py3-none-any.whl", hash = "sha256:8241a8e9f8dd3c1fafe9696e6e081b57a208ef907e9939c44e7415e407ab40ea"},
|
||||
]
|
||||
greenlet = [
|
||||
{file = "greenlet-1.1.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:476ba9435afaead4382fbab8f1882f75e3fb2285c35c9285abb3dd30237f9142"},
|
||||
@ -1856,6 +1817,11 @@ pluggy = [
|
||||
]
|
||||
psycopg2-binary = [
|
||||
{file = "psycopg2-binary-2.9.1.tar.gz", hash = "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773"},
|
||||
{file = "psycopg2_binary-2.9.1-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f"},
|
||||
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759"},
|
||||
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e"},
|
||||
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a"},
|
||||
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c"},
|
||||
{file = "psycopg2_binary-2.9.1-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76"},
|
||||
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698"},
|
||||
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616"},
|
||||
@ -2097,10 +2063,6 @@ rsa = [
|
||||
{file = "rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2"},
|
||||
{file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"},
|
||||
]
|
||||
scrape-schema-recipe = [
|
||||
{file = "scrape-schema-recipe-0.1.5.tar.gz", hash = "sha256:aa49499389f045905a31426517c98a8a66e156b2744fe9cb62b636ffc4b3ce0f"},
|
||||
{file = "scrape_schema_recipe-0.1.5-py2.py3-none-any.whl", hash = "sha256:ac9173a9f05711ba5bcbf98546c4c36e8bc76ba94827e3ddb2472a2b8052960c"},
|
||||
]
|
||||
six = [
|
||||
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
@ -2220,10 +2182,6 @@ uvloop = [
|
||||
{file = "uvloop-0.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e5f2e2ff51aefe6c19ee98af12b4ae61f5be456cd24396953244a30880ad861"},
|
||||
{file = "uvloop-0.16.0.tar.gz", hash = "sha256:f74bc20c7b67d1c27c72601c78cf95be99d5c2cdd4514502b4f3eb0933ff1228"},
|
||||
]
|
||||
validators = [
|
||||
{file = "validators-0.18.2-py3-none-any.whl", hash = "sha256:0143dcca8a386498edaf5780cbd5960da1a4c85e0719f3ee5c9b41249c4fefbd"},
|
||||
{file = "validators-0.18.2.tar.gz", hash = "sha256:37cd9a9213278538ad09b5b9f9134266e7c226ab1fede1d500e29e0a8fbb9ea6"},
|
||||
]
|
||||
w3lib = [
|
||||
{file = "w3lib-1.22.0-py2.py3-none-any.whl", hash = "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53"},
|
||||
{file = "w3lib-1.22.0.tar.gz", hash = "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"},
|
||||
|
@ -23,7 +23,6 @@ python-slugify = "^4.0.1"
|
||||
requests = "^2.25.1"
|
||||
PyYAML = "^5.3.1"
|
||||
extruct = "^0.12.0"
|
||||
scrape-schema-recipe = "^0.1.3"
|
||||
python-multipart = "^0.0.5"
|
||||
fastapi-camelcase = "^1.0.2"
|
||||
bcrypt = "^3.2.0"
|
||||
|
Loading…
x
Reference in New Issue
Block a user