Fix the recipe scraper debugger (#736)

* Fix recipe debugger

* Remove scrape-schema-recipe from dependencies

* Fix breaking tests
This commit is contained in:
cadamswaite 2021-10-20 01:01:35 +01:00 committed by GitHub
parent 75113cc2c7
commit 3831eef508
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 76 deletions

View File

@ -16,8 +16,7 @@ from mealie.schema.user import UserInDB
from mealie.services.events import create_recipe_event from mealie.services.events import create_recipe_event
from mealie.services.image.image import scrape_image, write_image from mealie.services.image.image import scrape_image, write_image
from mealie.services.recipe.media import check_assets, delete_assets from mealie.services.recipe.media import check_assets, delete_assets
from mealie.services.scraper.scraper import create_from_url from mealie.services.scraper.scraper import create_from_url, scrape_from_url
from scrape_schema_recipe import scrape_url
from slugify import slugify from slugify import slugify
from sqlalchemy.orm.session import Session from sqlalchemy.orm.session import Session
from starlette.responses import FileResponse from starlette.responses import FileResponse
@ -50,7 +49,11 @@ def create_from_json(
@user_router.post("/test-scrape-url") @user_router.post("/test-scrape-url")
def test_parse_recipe_url(url: RecipeURLIn): def test_parse_recipe_url(url: RecipeURLIn):
return scrape_url(url.url) # Debugger should produce the same result as the scraper sees before cleaning
scraped_data = scrape_from_url(url.url)
if scraped_data:
return scraped_data.schema.data
return "recipe_scrapers was unable to scrape this URL"
@user_router.post("/create-url", status_code=201, response_model=str) @user_router.post("/create-url", status_code=201, response_model=str)

View File

@ -1,6 +1,6 @@
import json import json
from enum import Enum from enum import Enum
from typing import Any, Callable from typing import Any, Callable, Optional
from uuid import uuid4 from uuid import uuid4
import requests import requests
@ -29,7 +29,14 @@ def create_from_url(url: str) -> Recipe:
Returns: Returns:
Recipe: Recipe Object Recipe: Recipe Object
""" """
new_recipe = scrape_from_url(url) # Try the different scrapers in order.
if scraped_data := scrape_from_url(url):
new_recipe = clean_scraper(scraped_data, url)
elif og_dict := extract_open_graph_values(url):
new_recipe = Recipe(**og_dict)
else:
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
logger.info(f"Image {new_recipe.image}") logger.info(f"Image {new_recipe.image}")
new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image) new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
@ -46,16 +53,17 @@ class ParserErrors(str, Enum):
CONNECTION_ERROR = "CONNECTION_ERROR" CONNECTION_ERROR = "CONNECTION_ERROR"
def extract_open_graph_values(url) -> Recipe: def extract_open_graph_values(url) -> Optional[dict]:
r = requests.get(url) r = requests.get(url)
recipe = open_graph.basic_recipe_from_opengraph(r.text, url) recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
if recipe.get("name", "") == "":
return Recipe(**recipe) return None
return recipe
def scrape_from_url(url: str) -> Recipe: def scrape_from_url(url: str):
"""Entry function to generating are recipe obejct from a url """Entry function to scrape a recipe from a url
This will determine if a url can be parsed and raise an appropriate error keyword This will determine if a url can be parsed and return None if not, to allow another parser to try.
This keyword is used on the frontend to reference a localized string to present on the UI. This keyword is used on the frontend to reference a localized string to present on the UI.
Args: Args:
@ -65,7 +73,7 @@ def scrape_from_url(url: str) -> Recipe:
HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
Returns: Returns:
Recipe: Recipe Model Optional[Scraped schema for cleaning]
""" """
try: try:
scraped_schema = scrape_me(url) scraped_schema = scrape_me(url)
@ -73,28 +81,26 @@ def scrape_from_url(url: str) -> Recipe:
try: try:
scraped_schema = scrape_me(url, wild_mode=True) scraped_schema = scrape_me(url, wild_mode=True)
except (NoSchemaFoundInWildMode, AttributeError): except (NoSchemaFoundInWildMode, AttributeError):
recipe = extract_open_graph_values(url) # Recipe_scraper was unable to extract a recipe.
if recipe.name != "": return None
return recipe
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
except ConnectionError: except ConnectionError:
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value}) raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value})
# Check to see if the recipe is valid
try: try:
ingredients = scraped_schema.ingredients()
instruct = scraped_schema.instructions() instruct = scraped_schema.instructions()
except Exception: except Exception:
ingredients = []
instruct = [] instruct = []
try: if instruct and ingredients:
ing = scraped_schema.ingredients() return scraped_schema
except Exception:
ing = []
if not instruct and not ing: # recipe_scrapers did not get a valid recipe.
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value}) # Return None to let another scraper try.
else: return None
return clean_scraper(scraped_schema, url)
def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe: def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:

60
poetry.lock generated
View File

@ -229,14 +229,6 @@ sdist = ["setuptools-rust (>=0.11.4)"]
ssh = ["bcrypt (>=3.1.5)"] ssh = ["bcrypt (>=3.1.5)"]
test = ["pytest (>=6.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] test = ["pytest (>=6.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"]
[[package]]
name = "decorator"
version = "5.1.0"
description = "Decorators for Humans"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]] [[package]]
name = "ecdsa" name = "ecdsa"
version = "0.17.0" version = "0.17.0"
@ -329,7 +321,7 @@ python-versions = "*"
python-dateutil = ">=2.8.1" python-dateutil = ">=2.8.1"
[package.extras] [package.extras]
dev = ["twine", "markdown", "flake8"] dev = ["twine", "markdown", "flake8", "wheel"]
[[package]] [[package]]
name = "greenlet" name = "greenlet"
@ -1045,20 +1037,6 @@ python-versions = ">=3.5, <4"
[package.dependencies] [package.dependencies]
pyasn1 = ">=0.1.3" pyasn1 = ">=0.1.3"
[[package]]
name = "scrape-schema-recipe"
version = "0.1.5"
description = "Extracts cooking recipe from HTML structured data in the https://schema.org/Recipe format."
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
extruct = "*"
isodate = ">=0.5.1"
requests = "*"
validators = ">=0.12.4"
[[package]] [[package]]
name = "six" name = "six"
version = "1.16.0" version = "1.16.0"
@ -1208,21 +1186,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=4.1.2,<4.2.0
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"]
test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"] test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"]
[[package]]
name = "validators"
version = "0.18.2"
description = "Python Data Validation for Humans™."
category = "main"
optional = false
python-versions = ">=3.4"
[package.dependencies]
decorator = ">=3.4.0"
six = ">=1.4.0"
[package.extras]
test = ["pytest (>=2.2.3)", "flake8 (>=2.4.0)", "isort (>=4.2.2)"]
[[package]] [[package]]
name = "w3lib" name = "w3lib"
version = "1.22.0" version = "1.22.0"
@ -1295,7 +1258,7 @@ pgsql = ["psycopg2-binary"]
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "0cafb5f1d3d4ca7b1f27acc107bb1b2bc07d87b3f09e589351b8963ce7fed006" content-hash = "2e7a7366808044d28f47c02bab618d21a3c92bb028e30b9f5ca5232b388f2696"
[metadata.files] [metadata.files]
aiofiles = [ aiofiles = [
@ -1470,6 +1433,7 @@ cryptography = [
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085"}, {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085"},
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b"}, {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b"},
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb"}, {file = "cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb"},
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d"},
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89"}, {file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89"},
{file = "cryptography-3.4.8-cp36-abi3-win32.whl", hash = "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7"}, {file = "cryptography-3.4.8-cp36-abi3-win32.whl", hash = "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7"},
{file = "cryptography-3.4.8-cp36-abi3-win_amd64.whl", hash = "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc"}, {file = "cryptography-3.4.8-cp36-abi3-win_amd64.whl", hash = "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc"},
@ -1483,10 +1447,6 @@ cryptography = [
{file = "cryptography-3.4.8-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e"}, {file = "cryptography-3.4.8-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e"},
{file = "cryptography-3.4.8.tar.gz", hash = "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c"}, {file = "cryptography-3.4.8.tar.gz", hash = "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c"},
] ]
decorator = [
{file = "decorator-5.1.0-py3-none-any.whl", hash = "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374"},
{file = "decorator-5.1.0.tar.gz", hash = "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"},
]
ecdsa = [ ecdsa = [
{file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"},
{file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"}, {file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"},
@ -1508,6 +1468,7 @@ flake8 = [
] ]
ghp-import = [ ghp-import = [
{file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"}, {file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"},
{file = "ghp_import-2.0.1-py3-none-any.whl", hash = "sha256:8241a8e9f8dd3c1fafe9696e6e081b57a208ef907e9939c44e7415e407ab40ea"},
] ]
greenlet = [ greenlet = [
{file = "greenlet-1.1.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:476ba9435afaead4382fbab8f1882f75e3fb2285c35c9285abb3dd30237f9142"}, {file = "greenlet-1.1.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:476ba9435afaead4382fbab8f1882f75e3fb2285c35c9285abb3dd30237f9142"},
@ -1856,6 +1817,11 @@ pluggy = [
] ]
psycopg2-binary = [ psycopg2-binary = [
{file = "psycopg2-binary-2.9.1.tar.gz", hash = "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773"}, {file = "psycopg2-binary-2.9.1.tar.gz", hash = "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616"}, {file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616"},
@ -2097,10 +2063,6 @@ rsa = [
{file = "rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2"}, {file = "rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2"},
{file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"}, {file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"},
] ]
scrape-schema-recipe = [
{file = "scrape-schema-recipe-0.1.5.tar.gz", hash = "sha256:aa49499389f045905a31426517c98a8a66e156b2744fe9cb62b636ffc4b3ce0f"},
{file = "scrape_schema_recipe-0.1.5-py2.py3-none-any.whl", hash = "sha256:ac9173a9f05711ba5bcbf98546c4c36e8bc76ba94827e3ddb2472a2b8052960c"},
]
six = [ six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@ -2220,10 +2182,6 @@ uvloop = [
{file = "uvloop-0.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e5f2e2ff51aefe6c19ee98af12b4ae61f5be456cd24396953244a30880ad861"}, {file = "uvloop-0.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e5f2e2ff51aefe6c19ee98af12b4ae61f5be456cd24396953244a30880ad861"},
{file = "uvloop-0.16.0.tar.gz", hash = "sha256:f74bc20c7b67d1c27c72601c78cf95be99d5c2cdd4514502b4f3eb0933ff1228"}, {file = "uvloop-0.16.0.tar.gz", hash = "sha256:f74bc20c7b67d1c27c72601c78cf95be99d5c2cdd4514502b4f3eb0933ff1228"},
] ]
validators = [
{file = "validators-0.18.2-py3-none-any.whl", hash = "sha256:0143dcca8a386498edaf5780cbd5960da1a4c85e0719f3ee5c9b41249c4fefbd"},
{file = "validators-0.18.2.tar.gz", hash = "sha256:37cd9a9213278538ad09b5b9f9134266e7c226ab1fede1d500e29e0a8fbb9ea6"},
]
w3lib = [ w3lib = [
{file = "w3lib-1.22.0-py2.py3-none-any.whl", hash = "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53"}, {file = "w3lib-1.22.0-py2.py3-none-any.whl", hash = "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53"},
{file = "w3lib-1.22.0.tar.gz", hash = "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"}, {file = "w3lib-1.22.0.tar.gz", hash = "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"},

View File

@ -23,7 +23,6 @@ python-slugify = "^4.0.1"
requests = "^2.25.1" requests = "^2.25.1"
PyYAML = "^5.3.1" PyYAML = "^5.3.1"
extruct = "^0.12.0" extruct = "^0.12.0"
scrape-schema-recipe = "^0.1.3"
python-multipart = "^0.0.5" python-multipart = "^0.0.5"
fastapi-camelcase = "^1.0.2" fastapi-camelcase = "^1.0.2"
bcrypt = "^3.2.0" bcrypt = "^3.2.0"