Feature/Capture Scraper Improvement PRs (#749)

* capture #732 changes

* capture PR #733

* capture PR #736

* capture pr #745

Co-authored-by: Hayden <hay-kot@pm.me>
Hayden 2021-10-19 15:55:45 -08:00 committed by GitHub
parent 58349bc439
commit 89da1a2654
7 changed files with 543 additions and 475 deletions

View File

@@ -4,7 +4,6 @@ from fastapi import Depends, File
 from fastapi.datastructures import UploadFile
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import JSONResponse
-from scrape_schema_recipe import scrape_url
 from sqlalchemy.orm.session import Session
 from starlette.responses import FileResponse

@@ -16,7 +15,7 @@ from mealie.routes.routers import UserAPIRouter
 from mealie.schema.recipe import CreateRecipeByURL, Recipe, RecipeImageTypes
 from mealie.schema.recipe.recipe import CreateRecipe, RecipeSummary
 from mealie.services.recipe.recipe_service import RecipeService
-from mealie.services.scraper.scraper import create_from_url
+from mealie.services.scraper.scraper import create_from_url, scrape_from_url

 user_router = UserAPIRouter()
 logger = get_logger()

@@ -44,8 +43,11 @@ def parse_recipe_url(url: CreateRecipeByURL, recipe_service: RecipeService = Dep
 @user_router.post("/test-scrape-url")
 def test_parse_recipe_url(url: CreateRecipeByURL):
-    # TODO: Replace with more current implementation of testing schema
-    return scrape_url(url.url)
+    # Debugger should produce the same result as the scraper sees before cleaning
+    scraped_data = scrape_from_url(url.url)
+    if scraped_data:
+        return scraped_data.schema.data
+
+    return "recipe_scrapers was unable to scrape this URL"

 @user_router.post("/create-from-zip", status_code=201)

View File

@@ -44,10 +44,24 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:
 def scrape_image(image_url: str, slug: str) -> Path:
     logger.info(f"Image URL: {image_url}")

     if isinstance(image_url, str):  # Handles String Types
-        image_url = image_url
+        pass

     if isinstance(image_url, list):  # Handles List Types
-        image_url = image_url[0]
+        # Multiple images have been defined in the schema - usually different resolutions
+        # Typically would be in smallest->biggest order, but can't be certain so test each.
+        # 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
+        all_image_requests = []
+        for url in image_url:
+            try:
+                r = requests.get(url, stream=True, headers={"User-Agent": ""})
+            except Exception:
+                logger.exception(f"Image {url} could not be requested")
+                continue
+            if r.status_code == 200:
+                all_image_requests.append((url, r))
+
+        image_url, _ = max(all_image_requests, key=lambda url_r: len(url_r[1].content), default=("", 0))

     if isinstance(image_url, dict):  # Handles Dictionary Types
         for key in image_url:

@@ -70,6 +84,6 @@ def scrape_image(image_url: str, slug: str) -> Path:
     filename.unlink(missing_ok=True)
-    return slug
+    return Path(slug)

     return None
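When the schema supplies a list of image URLs, the new code requests each candidate and keeps the one whose response body is largest, on the assumption that a bigger payload means a higher-resolution image. A standalone sketch of that selection step (the helper name is hypothetical; the empty User-Agent header mirrors the diff above):

import requests

def pick_largest_image(urls: list) -> str:
    """Return the URL whose response body is largest, or "" if every request fails."""
    candidates = []
    for url in urls:
        try:
            r = requests.get(url, stream=True, headers={"User-Agent": ""})
        except requests.RequestException:
            continue
        if r.status_code == 200:
            candidates.append((url, len(r.content)))

    # default= keeps max() from raising on an empty list
    best_url, _ = max(candidates, key=lambda pair: pair[1], default=("", 0))
    return best_url

# pick_largest_image(["https://example.com/small.jpg", "https://example.com/large.jpg"])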

View File

@@ -2,7 +2,7 @@ import html
 import json
 import re
 from datetime import datetime, timedelta
-from typing import List
+from typing import List, Optional

 from slugify import slugify

@@ -43,9 +43,13 @@ def clean_string(text: str) -> str:
     if isinstance(text, list):
         text = text[0]

+    print(type(text))
     if text == "" or text is None:
         return ""

+    print(text)
     cleaned_text = html.unescape(text)
     cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
     cleaned_text = re.sub(" +", " ", cleaned_text)

@@ -67,6 +71,40 @@ def clean_html(raw_html):
     return re.sub(cleanr, "", raw_html)

+def clean_nutrition(nutrition: Optional[dict]) -> dict[str, str]:
+    # Assumes that all units are supplied in grams, except sodium which may be in mg.
+
+    # Fn only expects a dict[str,str]. Other structures should not be parsed.
+    if not isinstance(nutrition, dict):
+        return {}
+
+    # Allow for commas as decimals (common in Europe)
+    # Compile once for efficiency
+    re_match_digits = re.compile(r"\d+([.,]\d+)?")
+
+    output_nutrition = {}
+    for key, val in nutrition.items():
+        # If the val contains digits matching the regex, add the first match to the output dict.
+        # Handle unexpected datastructures safely.
+        try:
+            if matched_digits := re_match_digits.search(val):
+                output_nutrition[key] = matched_digits.group(0)
+        except Exception:
+            continue
+
+    output_nutrition = {key: val.replace(",", ".") for key, val in output_nutrition.items()}
+
+    if "sodiumContent" in nutrition and "m" not in nutrition["sodiumContent"] and "g" in nutrition["sodiumContent"]:
+        # Sodium is in grams. Parse its value, multiply by 1k and return to string.
+        try:
+            output_nutrition["sodiumContent"] = str(float(output_nutrition["sodiumContent"]) * 1000)
+        except ValueError:
+            # Could not parse sodium content as float, so don't touch it.
+            pass
+
+    return output_nutrition
+
 def image(image=None) -> str:
     if not image:
         return "no image"
@@ -167,9 +205,11 @@ def clean_time(time_entry):
     elif isinstance(time_entry, datetime):
         print(time_entry)
     elif isinstance(time_entry, str):
-        if re.match("PT.*H.*M", time_entry):
+        try:
             time_delta_object = parse_duration(time_entry)
             return pretty_print_timedelta(time_delta_object)
+        except ValueError:
+            logger.error(f"Could not parse time_entry `{time_entry}`")
     else:
         return str(time_entry)
@@ -184,48 +224,34 @@ def parse_duration(iso_duration):
     Returns:
         a datetime.timedelta instance
     """
-    m = re.match(r"^P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:.\d+)?)S)?$", iso_duration)
+    m = re.match(
+        r"^P((\d+)Y)?((\d+)M)?((?P<days>\d+)D)?"
+        r"T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+(?:\.\d+)?)S)?$",
+        iso_duration,
+    )
     if m is None:
         raise ValueError("invalid ISO 8601 duration string")

-    days = 0
-    hours = 0
-    minutes = 0
-    seconds = 0.0
-
     # Years and months are not being utilized here, as there is not enough
     # information provided to determine which year and which month.
     # Python's time_delta class stores durations as days, seconds and
     # microseconds internally, and therefore we'd have to
     # convert parsed years and months to specific number of days.

-    if m[3]:
-        days = int(m[3])
-    if m[4]:
-        hours = int(m[4])
-    if m[5]:
-        minutes = int(m[5])
-    if m[6]:
-        seconds = float(m[6])
-
-    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+    times = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0}
+    for unit, _ in times.items():
+        if m.group(unit):
+            times[unit] = int(float(m.group(unit)))
+
+    return timedelta(**times)

-def pretty_print_timedelta(t, max_components=None, max_decimal_places=2):
+def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2):
     """
     Print a pretty string for a timedelta.
-    For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days, 4 hours, 48 minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the
+    For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'. Setting max_components to e.g. 1 will change this to '2.2 days', where the
     number of decimal points can also be set.
     """
-    time_scales = [
-        timedelta(days=365),
-        timedelta(days=1),
-        timedelta(hours=1),
-        timedelta(minutes=1),
-        timedelta(seconds=1),
-        timedelta(microseconds=1000),
-        timedelta(microseconds=1),
-    ]
     time_scale_names_dict = {
         timedelta(days=365): "year",
         timedelta(days=1): "day",
@@ -236,9 +262,8 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2):
         timedelta(microseconds=1): "microsecond",
     }
     count = 0
-    txt = ""
-    first = True
-    for scale in time_scales:
+    out_list = []
+    for scale, scale_name in time_scale_names_dict.items():
         if t >= scale:
             count += 1
             n = t / scale if count == max_components else int(t / scale)

@@ -247,15 +272,9 @@ def pretty_print_timedelta(t, max_components=None, max_decimal_places=2):
             n_txt = str(round(n, max_decimal_places))
             if n_txt[-2:] == ".0":
                 n_txt = n_txt[:-2]
-            txt += "{}{} {}{}".format(
-                "" if first else " ",
-                n_txt,
-                time_scale_names_dict[scale],
-                "s" if n > 1 else "",
-            )
-            if first:
-                first = False
-    if len(txt) == 0:
-        txt = "none"
-    return txt
+            out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}")
+
+    if out_list == []:
+        return "none"
+    return " ".join(out_list)

View File

@@ -1,6 +1,5 @@
-import json
 from enum import Enum
-from typing import Any, Callable
+from typing import Any, Callable, Optional
 from uuid import uuid4

 import requests

@@ -8,17 +7,11 @@ from fastapi import HTTPException, status
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
 from slugify import slugify

-from mealie.core.config import get_app_dirs
-
-app_dirs = get_app_dirs()
-
 from mealie.core.root_logger import get_logger
 from mealie.schema.recipe import Recipe, RecipeStep
 from mealie.services.image.image import scrape_image
 from mealie.services.scraper import cleaner, open_graph

-LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")
-
 logger = get_logger()
@@ -32,7 +25,14 @@ def create_from_url(url: str) -> Recipe:
     Returns:
         Recipe: Recipe Object
     """
-    new_recipe = scrape_from_url(url)
+    # Try the different scrapers in order.
+    if scraped_data := scrape_from_url(url):
+        new_recipe = clean_scraper(scraped_data, url)
+    elif og_dict := extract_open_graph_values(url):
+        new_recipe = Recipe(**og_dict)
+    else:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
+
     logger.info(f"Image {new_recipe.image}")
     new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
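create_from_url now walks a fixed fallback chain: the recipe_scrapers schema scraper first, the Open Graph extractor second, and an HTTP 400 only when both come back empty. Reduced to its control flow (a simplified sketch, not the full function; the helper name is hypothetical, the imports come from the modules shown in this diff):

from fastapi import HTTPException, status

from mealie.schema.recipe import Recipe
from mealie.services.scraper.scraper import (
    ParserErrors,
    clean_scraper,
    extract_open_graph_values,
    scrape_from_url,
)

def pick_scraper_result(url: str) -> Recipe:
    # 1. schema.org data via recipe_scrapers; scrape_from_url() returns None on failure
    if scraped_data := scrape_from_url(url):
        return clean_scraper(scraped_data, url)
    # 2. bare-bones dict from Open Graph tags; also returns None on failure
    if og_dict := extract_open_graph_values(url):
        return Recipe(**og_dict)
    # 3. nothing usable was found
    raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})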
@@ -49,16 +49,17 @@ class ParserErrors(str, Enum):
     CONNECTION_ERROR = "CONNECTION_ERROR"

-def extract_open_graph_values(url) -> Recipe:
+def extract_open_graph_values(url) -> Optional[dict]:
     r = requests.get(url)
     recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
+    if recipe.get("name", "") == "":
+        return None

-    return Recipe(**recipe)
+    return recipe

-def scrape_from_url(url: str) -> Recipe:
-    """Entry function to generating are recipe obejct from a url
+def scrape_from_url(url: str):
+    """Entry function to scrape a recipe from a url

-    This will determine if a url can be parsed and raise an appropriate error keyword
+    This will determine if a url can be parsed and return None if not, to allow another parser to try.
     This keyword is used on the frontend to reference a localized string to present on the UI.

     Args:

@@ -68,7 +69,7 @@ def scrape_from_url(url: str) -> Recipe:
         HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details

     Returns:
-        Recipe: Recipe Model
+        Optional[Scraped schema for cleaning]
     """
     try:
         scraped_schema = scrape_me(url)
@@ -76,28 +77,26 @@ def scrape_from_url(url: str):
         try:
             scraped_schema = scrape_me(url, wild_mode=True)
         except (NoSchemaFoundInWildMode, AttributeError):
-            recipe = extract_open_graph_values(url)
-            if recipe.name != "":
-                return recipe
-            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
+            # Recipe_scraper was unable to extract a recipe.
+            return None

     except ConnectionError:
         raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value})

+    # Check to see if the recipe is valid
     try:
+        ingredients = scraped_schema.ingredients()
         instruct = scraped_schema.instructions()
     except Exception:
+        ingredients = []
         instruct = []

-    try:
-        ing = scraped_schema.ingredients()
-    except Exception:
-        ing = []
-
-    if not instruct and not ing:
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value})
-    else:
-        return clean_scraper(scraped_schema, url)
+    if instruct and ingredients:
+        return scraped_schema
+
+    # recipe_scrapers did not get a valid recipe.
+    # Return None to let another scraper try.
+    return None

 def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
@@ -135,17 +134,22 @@ def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
         except TypeError:
             return []

+    cook_time = try_get_default(None, "performTime", None, cleaner.clean_time) or try_get_default(
+        None, "cookTime", None, cleaner.clean_time
+    )
+
     return Recipe(
         name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
         slug="",
-        image=try_get_default(scraped_data.image, "image", None),
+        image=try_get_default(None, "image", None),
         description=try_get_default(None, "description", "", cleaner.clean_string),
+        nutrition=try_get_default(None, "nutrition", None, cleaner.clean_nutrition),
         recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
         recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
         recipe_instructions=get_instructions(),
         total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
         prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
-        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+        perform_time=cook_time,
         org_url=url,
     )
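perform_time is now filled from a single cook_time value that prefers the schema's performTime and falls back to cookTime, since sites use either property for the same thing. The precedence reduces to a plain or-chain; a tiny sketch against a dict-like payload (the helper is hypothetical, not the real try_get_default):

from typing import Optional

def first_cook_time(schema: dict) -> Optional[str]:
    # An empty or missing performTime falls through to cookTime.
    return schema.get("performTime") or schema.get("cookTime")

print(first_cook_time({"cookTime": "PT45M"}))                          # "PT45M"
print(first_cook_time({"performTime": "PT20M", "cookTime": "PT45M"}))  # "PT20M"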
@@ -160,10 +164,3 @@ def download_image_for_recipe(slug, image_url) -> dict:
         img_name = None

     return img_name or "no image"

-def dump_last_json(recipe_data: dict):
-    with open(LAST_JSON, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    return

poetry.lock (generated): 765 changed lines; file diff suppressed because it is too large.

View File

@@ -23,7 +23,6 @@ python-slugify = "^4.0.1"
 requests = "^2.25.1"
 PyYAML = "^5.3.1"
 extruct = "^0.12.0"
-scrape-schema-recipe = "^0.1.3"
 python-multipart = "^0.0.5"
 fastapi-camelcase = "^1.0.2"
 bcrypt = "^3.2.0"

@@ -50,6 +49,7 @@ coverage = "^5.5"
 pydantic-to-typescript = "^1.0.7"
 rich = "^10.7.0"
 isort = "^5.9.3"
+regex = "2021.9.30"  # TODO: Remove during Upgrade -> https://github.com/psf/black/issues/2524

 [build-system]
 requires = ["poetry-core>=1.0.0"]

View File

@@ -1,5 +1,6 @@
 import json
 import re
+from datetime import timedelta

 import pytest
@@ -59,6 +60,24 @@ def test_clean_image():
     assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"

+@pytest.mark.parametrize(
+    "nutrition,expected",
+    [
+        (None, {}),
+        ({"calories": "105 kcal"}, {"calories": "105"}),
+        ({"calories": "105 kcal 104 sugar"}, {"calories": "105"}),
+        ({"calories": ""}, {}),
+        ({"calories": ["not just a string"], "sugarContent": "but still tries 555.321"}, {"sugarContent": "555.321"}),
+        ({"sodiumContent": "5.1235g"}, {"sodiumContent": "5123.5"}),
+        ({"sodiumContent": "5mg"}, {"sodiumContent": "5"}),
+        ({"sodiumContent": "10oz"}, {"sodiumContent": "10"}),
+        ({"sodiumContent": "10.1.2g"}, {"sodiumContent": "10100.0"}),
+    ],
+)
+def test_clean_nutrition(nutrition, expected):
+    assert cleaner.clean_nutrition(nutrition) == expected
+
 @pytest.mark.parametrize(
     "instructions",
     [
@@ -90,9 +109,29 @@ def test_html_with_recipe_data():
     assert url_validation_regex.match(recipe_data["image"])

-def test_time_cleaner():
-
-    my_time_delta = "PT2H30M"
-    return_delta = cleaner.clean_time(my_time_delta)
-
-    assert return_delta == "2 Hours 30 Minutes"
+@pytest.mark.parametrize(
+    "time_delta,expected",
+    [
+        ("PT2H30M", "2 Hours 30 Minutes"),
+        ("PT30M", "30 Minutes"),
+        ("PT3H", "3 Hours"),
+        ("P1DT1H1M1S", "1 day 1 Hour 1 Minute 1 Second"),
+        ("P1DT1H1M1.53S", "1 day 1 Hour 1 Minute 1 Second"),
+        ("PT-3H", None),
+        ("PT", "none"),
+    ],
+)
+def test_time_cleaner(time_delta, expected):
+    assert cleaner.clean_time(time_delta) == expected
+
+
+@pytest.mark.parametrize(
+    "t,max_components,max_decimal_places,expected",
+    [
+        (timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"),
+        (timedelta(days=2, seconds=17280), 1, 2, "2.2 days"),
+        (timedelta(days=365), None, 2, "1 year"),
+    ],
+)
+def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
+    assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected