Fix the recipe scraper debugger (#736)

* Fix recipe debugger

* Remove scrape-schema-recipe from dependencies

* Fix breaking tests
This commit is contained in:
cadamswaite 2021-10-20 01:01:35 +01:00 committed by GitHub
parent 75113cc2c7
commit 3831eef508
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 76 deletions

View File

@ -16,8 +16,7 @@ from mealie.schema.user import UserInDB
from mealie.services.events import create_recipe_event
from mealie.services.image.image import scrape_image, write_image
from mealie.services.recipe.media import check_assets, delete_assets
from mealie.services.scraper.scraper import create_from_url
from scrape_schema_recipe import scrape_url
from mealie.services.scraper.scraper import create_from_url, scrape_from_url
from slugify import slugify
from sqlalchemy.orm.session import Session
from starlette.responses import FileResponse
@ -50,7 +49,11 @@ def create_from_json(
@user_router.post("/test-scrape-url")
def test_parse_recipe_url(url: RecipeURLIn):
return scrape_url(url.url)
# Debugger should produce the same result as the scraper sees before cleaning
scraped_data = scrape_from_url(url.url)
if scraped_data:
return scraped_data.schema.data
return "recipe_scrapers was unable to scrape this URL"
@user_router.post("/create-url", status_code=201, response_model=str)

View File

@ -1,6 +1,6 @@
import json
from enum import Enum
from typing import Any, Callable
from typing import Any, Callable, Optional
from uuid import uuid4
import requests
@ -29,7 +29,14 @@ def create_from_url(url: str) -> Recipe:
Returns:
Recipe: Recipe Object
"""
new_recipe = scrape_from_url(url)
# Try the different scrapers in order.
if scraped_data := scrape_from_url(url):
new_recipe = clean_scraper(scraped_data, url)
elif og_dict := extract_open_graph_values(url):
new_recipe = Recipe(**og_dict)
else:
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
logger.info(f"Image {new_recipe.image}")
new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
@ -46,16 +53,17 @@ class ParserErrors(str, Enum):
CONNECTION_ERROR = "CONNECTION_ERROR"
def extract_open_graph_values(url) -> Recipe:
def extract_open_graph_values(url) -> Optional[dict]:
r = requests.get(url)
recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
return Recipe(**recipe)
if recipe.get("name", "") == "":
return None
return recipe
def scrape_from_url(url: str) -> Recipe:
"""Entry function to generating are recipe obejct from a url
This will determine if a url can be parsed and raise an appropriate error keyword
def scrape_from_url(url: str):
"""Entry function to scrape a recipe from a url
This will determine if a url can be parsed and return None if not, to allow another parser to try.
When an HTTPException is raised, its error keyword is used on the frontend to reference a localized string to present on the UI.
Args:
@ -65,7 +73,7 @@ def scrape_from_url(url: str) -> Recipe:
HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
Returns:
Recipe: Recipe Model
The scraped schema object (to be passed on for cleaning), or None if no valid recipe could be scraped
"""
try:
scraped_schema = scrape_me(url)
@ -73,28 +81,26 @@ def scrape_from_url(url: str) -> Recipe:
try:
scraped_schema = scrape_me(url, wild_mode=True)
except (NoSchemaFoundInWildMode, AttributeError):
recipe = extract_open_graph_values(url)
if recipe.name != "":
return recipe
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
# recipe_scrapers was unable to extract a recipe.
return None
except ConnectionError:
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value})
# Check to see if the recipe is valid
try:
ingredients = scraped_schema.ingredients()
instruct = scraped_schema.instructions()
except Exception:
ingredients = []
instruct = []
try:
ing = scraped_schema.ingredients()
except Exception:
ing = []
if instruct and ingredients:
return scraped_schema
if not instruct and not ing:
raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value})
else:
return clean_scraper(scraped_schema, url)
# recipe_scrapers did not get a valid recipe.
# Return None to let another scraper try.
return None
def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:

60
poetry.lock generated
View File

@ -229,14 +229,6 @@ sdist = ["setuptools-rust (>=0.11.4)"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["pytest (>=6.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"]
[[package]]
name = "decorator"
version = "5.1.0"
description = "Decorators for Humans"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "ecdsa"
version = "0.17.0"
@ -329,7 +321,7 @@ python-versions = "*"
python-dateutil = ">=2.8.1"
[package.extras]
dev = ["twine", "markdown", "flake8"]
dev = ["twine", "markdown", "flake8", "wheel"]
[[package]]
name = "greenlet"
@ -1045,20 +1037,6 @@ python-versions = ">=3.5, <4"
[package.dependencies]
pyasn1 = ">=0.1.3"
[[package]]
name = "scrape-schema-recipe"
version = "0.1.5"
description = "Extracts cooking recipe from HTML structured data in the https://schema.org/Recipe format."
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
extruct = "*"
isodate = ">=0.5.1"
requests = "*"
validators = ">=0.12.4"
[[package]]
name = "six"
version = "1.16.0"
@ -1208,21 +1186,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=3.6.0)", "Sphinx (>=4.1.2,<4.2.0
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"]
test = ["aiohttp", "flake8 (>=3.9.2,<3.10.0)", "psutil", "pycodestyle (>=2.7.0,<2.8.0)", "pyOpenSSL (>=19.0.0,<19.1.0)", "mypy (>=0.800)"]
[[package]]
name = "validators"
version = "0.18.2"
description = "Python Data Validation for Humans™."
category = "main"
optional = false
python-versions = ">=3.4"
[package.dependencies]
decorator = ">=3.4.0"
six = ">=1.4.0"
[package.extras]
test = ["pytest (>=2.2.3)", "flake8 (>=2.4.0)", "isort (>=4.2.2)"]
[[package]]
name = "w3lib"
version = "1.22.0"
@ -1295,7 +1258,7 @@ pgsql = ["psycopg2-binary"]
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "0cafb5f1d3d4ca7b1f27acc107bb1b2bc07d87b3f09e589351b8963ce7fed006"
content-hash = "2e7a7366808044d28f47c02bab618d21a3c92bb028e30b9f5ca5232b388f2696"
[metadata.files]
aiofiles = [
@ -1470,6 +1433,7 @@ cryptography = [
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085"},
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b"},
{file = "cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb"},
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d"},
{file = "cryptography-3.4.8-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89"},
{file = "cryptography-3.4.8-cp36-abi3-win32.whl", hash = "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7"},
{file = "cryptography-3.4.8-cp36-abi3-win_amd64.whl", hash = "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc"},
@ -1483,10 +1447,6 @@ cryptography = [
{file = "cryptography-3.4.8-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e"},
{file = "cryptography-3.4.8.tar.gz", hash = "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c"},
]
decorator = [
{file = "decorator-5.1.0-py3-none-any.whl", hash = "sha256:7b12e7c3c6ab203a29e157335e9122cb03de9ab7264b137594103fd4a683b374"},
{file = "decorator-5.1.0.tar.gz", hash = "sha256:e59913af105b9860aa2c8d3272d9de5a56a4e608db9a2f167a8480b323d529a7"},
]
ecdsa = [
{file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"},
{file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"},
@ -1508,6 +1468,7 @@ flake8 = [
]
ghp-import = [
{file = "ghp-import-2.0.1.tar.gz", hash = "sha256:753de2eace6e0f7d4edfb3cce5e3c3b98cd52aadb80163303d1d036bda7b4483"},
{file = "ghp_import-2.0.1-py3-none-any.whl", hash = "sha256:8241a8e9f8dd3c1fafe9696e6e081b57a208ef907e9939c44e7415e407ab40ea"},
]
greenlet = [
{file = "greenlet-1.1.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:476ba9435afaead4382fbab8f1882f75e3fb2285c35c9285abb3dd30237f9142"},
@ -1856,6 +1817,11 @@ pluggy = [
]
psycopg2-binary = [
{file = "psycopg2-binary-2.9.1.tar.gz", hash = "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a"},
{file = "psycopg2_binary-2.9.1-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698"},
{file = "psycopg2_binary-2.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616"},
@ -2097,10 +2063,6 @@ rsa = [
{file = "rsa-4.7.2-py3-none-any.whl", hash = "sha256:78f9a9bf4e7be0c5ded4583326e7461e3a3c5aae24073648b4bdfa797d78c9d2"},
{file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"},
]
scrape-schema-recipe = [
{file = "scrape-schema-recipe-0.1.5.tar.gz", hash = "sha256:aa49499389f045905a31426517c98a8a66e156b2744fe9cb62b636ffc4b3ce0f"},
{file = "scrape_schema_recipe-0.1.5-py2.py3-none-any.whl", hash = "sha256:ac9173a9f05711ba5bcbf98546c4c36e8bc76ba94827e3ddb2472a2b8052960c"},
]
six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
@ -2220,10 +2182,6 @@ uvloop = [
{file = "uvloop-0.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e5f2e2ff51aefe6c19ee98af12b4ae61f5be456cd24396953244a30880ad861"},
{file = "uvloop-0.16.0.tar.gz", hash = "sha256:f74bc20c7b67d1c27c72601c78cf95be99d5c2cdd4514502b4f3eb0933ff1228"},
]
validators = [
{file = "validators-0.18.2-py3-none-any.whl", hash = "sha256:0143dcca8a386498edaf5780cbd5960da1a4c85e0719f3ee5c9b41249c4fefbd"},
{file = "validators-0.18.2.tar.gz", hash = "sha256:37cd9a9213278538ad09b5b9f9134266e7c226ab1fede1d500e29e0a8fbb9ea6"},
]
w3lib = [
{file = "w3lib-1.22.0-py2.py3-none-any.whl", hash = "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53"},
{file = "w3lib-1.22.0.tar.gz", hash = "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"},

View File

@ -23,7 +23,6 @@ python-slugify = "^4.0.1"
requests = "^2.25.1"
PyYAML = "^5.3.1"
extruct = "^0.12.0"
scrape-schema-recipe = "^0.1.3"
python-multipart = "^0.0.5"
fastapi-camelcase = "^1.0.2"
bcrypt = "^3.2.0"