diff --git a/mealie/routes/recipe/recipe_crud_routes.py b/mealie/routes/recipe/recipe_crud_routes.py index 8e764096456a..e3360f84297c 100644 --- a/mealie/routes/recipe/recipe_crud_routes.py +++ b/mealie/routes/recipe/recipe_crud_routes.py @@ -16,8 +16,7 @@ from mealie.schema.user import UserInDB from mealie.services.events import create_recipe_event from mealie.services.image.image import scrape_image, write_image from mealie.services.recipe.media import check_assets, delete_assets -from mealie.services.scraper.scraper import create_from_url -from scrape_schema_recipe import scrape_url +from mealie.services.scraper.scraper import create_from_url, scrape_from_url from slugify import slugify from sqlalchemy.orm.session import Session from starlette.responses import FileResponse @@ -50,7 +49,11 @@ def create_from_json( @user_router.post("/test-scrape-url") def test_parse_recipe_url(url: RecipeURLIn): - return scrape_url(url.url) + # Debugger should produce the same result as the scraper sees before cleaning + scraped_data = scrape_from_url(url.url) + if scraped_data: + return scraped_data.schema.data + return "recipe_scrapers was unable to scrape this URL" @user_router.post("/create-url", status_code=201, response_model=str) diff --git a/mealie/services/scraper/scraper.py b/mealie/services/scraper/scraper.py index c066893ee2da..66b928f9efbc 100644 --- a/mealie/services/scraper/scraper.py +++ b/mealie/services/scraper/scraper.py @@ -1,6 +1,6 @@ import json from enum import Enum -from typing import Any, Callable +from typing import Any, Callable, Optional from uuid import uuid4 import requests @@ -29,7 +29,14 @@ def create_from_url(url: str) -> Recipe: Returns: Recipe: Recipe Object """ - new_recipe = scrape_from_url(url) + # Try the different scrapers in order. + if scraped_data := scrape_from_url(url): + new_recipe = clean_scraper(scraped_data, url) + elif og_dict := extract_open_graph_values(url): + new_recipe = Recipe(**og_dict) + else: + raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) + logger.info(f"Image {new_recipe.image}") new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image) @@ -46,16 +53,17 @@ class ParserErrors(str, Enum): CONNECTION_ERROR = "CONNECTION_ERROR" -def extract_open_graph_values(url) -> Recipe: +def extract_open_graph_values(url) -> Optional[dict]: r = requests.get(url) recipe = open_graph.basic_recipe_from_opengraph(r.text, url) - - return Recipe(**recipe) + if recipe.get("name", "") == "": + return None + return recipe -def scrape_from_url(url: str) -> Recipe: - """Entry function to generating are recipe obejct from a url - This will determine if a url can be parsed and raise an appropriate error keyword +def scrape_from_url(url: str): + """Entry function to scrape a recipe from a url + This will determine if a url can be parsed and return None if not, to allow another parser to try. This keyword is used on the frontend to reference a localized string to present on the UI. Args: @@ -65,7 +73,7 @@ def scrape_from_url(url: str) -> Recipe: HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details Returns: - Recipe: Recipe Model + Optional[Scraped schema for cleaning] """ try: scraped_schema = scrape_me(url) @@ -73,28 +81,26 @@ def scrape_from_url(url: str) -> Recipe: try: scraped_schema = scrape_me(url, wild_mode=True) except (NoSchemaFoundInWildMode, AttributeError): - recipe = extract_open_graph_values(url) - if recipe.name != "": - return recipe - raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value}) + # Recipe_scraper was unable to extract a recipe. + return None except ConnectionError: raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value}) + # Check to see if the recipe is valid try: + ingredients = scraped_schema.ingredients() instruct = scraped_schema.instructions() except Exception: + ingredients = [] instruct = [] - try: - ing = scraped_schema.ingredients() - except Exception: - ing = [] + if instruct and ingredients: + return scraped_schema - if not instruct and not ing: - raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value}) - else: - return clean_scraper(scraped_schema, url) + # recipe_scrapers did not get a valid recipe. + # Return None to let another scraper try. + return None def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe: diff --git a/poetry.lock b/poetry.lock index 6c6da07f5998..6d0428efc416 100644 --- a/poetry.lock +++ b/poetry.lock @@ -229,14 +229,6 @@ sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] test = ["pytest (>=6.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] -[[package]] -name = "decorator" -version = "5.1.0" -description = "Decorators for Humans" -category = "main" -optional = false -python-versions = ">=3.5" - [[package]] name = "ecdsa" version = "0.17.0" @@ -329,7 +321,7 @@ python-versions = "*" python-dateutil = ">=2.8.1" [package.extras] -dev = ["twine", "markdown", "flake8"] +dev = ["twine", "markdown", "flake8", "wheel"] [[package]] name = "greenlet" @@ -1045,20 +1037,6 @@ python-versions = ">=3.5, <4" [package.dependencies] pyasn1 = ">=0.1.3" -[[package]] -name = "scrape-schema-recipe" -version = "0.1.5" -description = "Extracts cooking recipe from HTML structured data in the https://schema.org/Recipe format." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -extruct = "*" -isodate = ">=0.5.1" -requests = "*" -validators = ">=0.12.4" - [[package]] name = "six" version = "1.16.0" @@ -1208,21 +1186,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=4.1.2,<4.2.0 docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] -[[package]] -name = "validators" -version = "0.18.2" -description = "Python Data Validation for Humans™." -category = "main" -optional = false -python-versions = ">=3.4" - -[package.dependencies] -decorator = ">=3.4.0" -six = ">=1.4.0" - -[package.extras] -test = ["pytest (>=2.2.3)", "flake8 (>=2.4.0)", "isort (>=4.2.2)"] - [[package]] name = "w3lib" version = "1.22.0" @@ -1295,7 +1258,7 @@ pgsql = ["psycopg2-binary"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "0cafb5f1d3d4ca7b1f27acc107bb1b2bc07d87b3f09e589351b8963ce7fed006" +content-hash = "2e7a7366808044d28f47c02bab618d21a3c92bb028e30b9f5ca5232b388f2696" [metadata.files] aiofiles = [ @@ -1470,6 +1433,7 @@ cryptography = [ {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085"}, {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b"}, {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb"}, + {file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d"}, {file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89"}, {file = "cryptography-3.4.8-cp36-abi3-win32.whl", hash = "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7"}, {file = "cryptography-3.4.8-cp36-abi3-win_amd64.whl", hash = "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc"}, @@ -1483,10 +1447,6 @@ cryptography = [ {file = "cryptography-3.4.8-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e"}, {file = "cryptography-3.4.8.tar.gz", hash = "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c"}, ] -decorator = [ - {file = "decorator-5.1.0-py3-none-any.whl", hash = "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374"}, - {file = "decorator-5.1.0.tar.gz", hash = "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"}, -] ecdsa = [ {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, {file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"}, @@ -1508,6 +1468,7 @@ flake8 = [ ] ghp-import = [ {file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"}, + {file = "ghp_import-2.0.1-py3-none-any.whl", hash = "sha256:8241a8e9f8dd3c1fafe9696e6e081b57a208ef907e9939c44e7415e407ab40ea"}, ] greenlet = [ {file = "greenlet-1.1.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:476ba9435afaead4382fbab8f1882f75e3fb2285c35c9285abb3dd30237f9142"}, @@ -1856,6 +1817,11 @@ pluggy = [ ] psycopg2-binary = [ {file = "psycopg2-binary-2.9.1.tar.gz", hash = "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773"}, + {file = "psycopg2_binary-2.9.1-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f"}, + {file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759"}, + {file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e"}, + {file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a"}, + {file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616"}, @@ -2097,10 +2063,6 @@ rsa = [ {file = "rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2"}, {file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"}, ] -scrape-schema-recipe = [ - {file = "scrape-schema-recipe-0.1.5.tar.gz", hash = "sha256:aa49499389f045905a31426517c98a8a66e156b2744fe9cb62b636ffc4b3ce0f"}, - {file = "scrape_schema_recipe-0.1.5-py2.py3-none-any.whl", hash = "sha256:ac9173a9f05711ba5bcbf98546c4c36e8bc76ba94827e3ddb2472a2b8052960c"}, -] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2220,10 +2182,6 @@ uvloop = [ {file = "uvloop-0.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e5f2e2ff51aefe6c19ee98af12b4ae61f5be456cd24396953244a30880ad861"}, {file = "uvloop-0.16.0.tar.gz", hash = "sha256:f74bc20c7b67d1c27c72601c78cf95be99d5c2cdd4514502b4f3eb0933ff1228"}, ] -validators = [ - {file = "validators-0.18.2-py3-none-any.whl", hash = "sha256:0143dcca8a386498edaf5780cbd5960da1a4c85e0719f3ee5c9b41249c4fefbd"}, - {file = "validators-0.18.2.tar.gz", hash = "sha256:37cd9a9213278538ad09b5b9f9134266e7c226ab1fede1d500e29e0a8fbb9ea6"}, -] w3lib = [ {file = "w3lib-1.22.0-py2.py3-none-any.whl", hash = "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53"}, {file = "w3lib-1.22.0.tar.gz", hash = "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"}, diff --git a/pyproject.toml b/pyproject.toml index 2f02353ce59a..9dfd0d996c52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ python-slugify = "^4.0.1" requests = "^2.25.1" PyYAML = "^5.3.1" extruct = "^0.12.0" -scrape-schema-recipe = "^0.1.3" python-multipart = "^0.0.5" fastapi-camelcase = "^1.0.2" bcrypt = "^3.2.0"