Feature/improve error message on scrape (#476)

* add better feedback on failed scrape

* fix json download link

* add better recipe parser

* bump deps

* fix force open on mobile

* formatting

* rewrite scraper to use new library

* fix failing tests

* bookmarklet support

* bookmarklet instructions

* recipes changelog

Co-authored-by: hay-kot <hay-kot@pm.me>
Hayden 2021-06-09 13:04:54 -08:00 committed by GitHub
parent 3702331630
commit a78fbea711
22 changed files with 658 additions and 15582 deletions


@@ -26,6 +26,12 @@
 ## Features and Improvements
 ### Highlights
+- Recipe Parser
+    - Recipes can now be imported with a bookmarklet!
+    - Significant improvement in supported sites with the new [Recipe Scraper Library](https://github.com/hhursev/recipe-scrapers)
+    - UI debugging is now available at `/recipes/debugger`
+    - Better error messages on failure
+    - ⚠️ last_recipe.json is now deprecated
 - Beta Support for Postgres! 🎉 See the getting started page for details
 - Recipe Features
   - Step Sections


@@ -3,6 +3,18 @@
 ## URL Import
 Adding a recipe can be as easy as copying the recipe URL into Mealie and letting the web scraper try to pull down the information. Currently this scraper is implemented with the [scrape-schema-recipe package](https://pypi.org/project/scrape-schema-recipe/). You may have mixed results on some websites, especially with blogs or non-specific recipe websites. See the bulk import option below for another convenient way to add blog-style recipes into Mealie.

+## Using Bookmarklets
+You can use bookmarklets to generate a bookmark that takes your current location and opens a new tab that tries to import that URL into Mealie.
+
+You can use a [bookmarklet generator site](https://caiorss.github.io/bookmarklet-maker/) and the code below to generate a bookmark for your site. Just change `http://localhost:8080` to your site's web address and follow the instructions. Note that there is no trailing `/`.
+
+```js
+var url = document.URL;
+var mealie = "http://localhost:8080";
+var dest = mealie + "/?recipe_import_url=" + url;
+window.open(dest, '_blank');
+```
+
 ## Recipe Editor
 Recipes can be edited and created via the UI. This is done with both a form-based approach, where you have a UI to work with, and an in-browser JSON editor. The JSON editor allows you to easily copy and paste data from other sources.

frontend/package-lock.json (generated): 15465 lines changed

File diff suppressed because it is too large


@@ -22,7 +22,7 @@
     "vue-i18n": "^8.24.1",
     "vue-router": "^3.5.1",
     "vuedraggable": "^2.24.3",
-    "vuetify": "^2.4.6",
+    "vuetify": "^2.5.3",
    "vuex": "^3.6.2",
    "vuex-persistedstate": "^4.0.0-beta.3"
  },


@@ -12,6 +12,7 @@ const recipeURLs = {
   allRecipesByCategory: prefix + "category",
   create: prefix + "create",
   createByURL: prefix + "create-url",
+  testParseURL: prefix + "test-scrape-url",
   recipe: slug => prefix + slug,
   update: slug => prefix + slug,
   delete: slug => prefix + slug,
@@ -29,11 +30,8 @@ export const recipeAPI = {
    * @returns {string} Recipe Slug
    */
   async createByURL(recipeURL) {
-    const response = await apiReq.post(
-      recipeURLs.createByURL,
-      { url: recipeURL },
-      () => i18n.t("recipe.recipe-creation-failed"),
-      () => i18n.t("recipe.recipe-created")
-    );
+    const response = await apiReq.post(recipeURLs.createByURL, { url: recipeURL }, false, () =>
+      i18n.t("recipe.recipe-created")
+    );

     store.dispatch("requestRecentRecipes");
@@ -186,4 +184,9 @@ export const recipeAPI = {
     const response = await apiReq.delete(API_ROUTES.recipesSlugCommentsId(slug, id));
     return response.data;
   },
+  async testScrapeURL(url) {
+    const response = await apiReq.post(recipeURLs.testParseURL, { url: url });
+    return response.data;
+  },
 };


@@ -3,9 +3,7 @@
   <v-dialog v-model="addRecipe" width="650" @click:outside="reset">
     <v-card :loading="processing">
       <v-app-bar dark color="primary mb-2">
-        <v-icon large left v-if="!processing">
-          mdi-link
-        </v-icon>
+        <v-icon large left v-if="!processing"> mdi-link </v-icon>

         <v-progress-circular v-else indeterminate color="white" large class="mr-2"> </v-progress-circular>
         <v-toolbar-title class="headline">
@@ -28,19 +26,58 @@
             persistent-hint
           ></v-text-field>

-          <v-alert v-if="error" color="red" outlined type="success">
-            {{ $t("new-recipe.error-message") }}
-          </v-alert>
+          <v-expand-transition>
+            <v-alert v-if="error" color="error" class="mt-6 white--text">
+              <v-card-title class="ma-0 pa-0">
+                <v-icon left color="white" x-large> mdi-robot </v-icon>
+                {{ $t("new-recipe.error-title") }}
+              </v-card-title>
+              <v-divider class="my-3 mx-2"></v-divider>
+              <p>
+                {{ $t("new-recipe.error-details") }}
+              </p>
+              <div class="d-flex row justify-space-around my-3 force-white">
+                <a
+                  class="dark"
+                  href="https://developers.google.com/search/docs/data-types/recipe"
+                  target="_blank"
+                  rel="noreferrer nofollow"
+                >
+                  Google ld+json Info
+                </a>
+                <a href="https://github.com/hay-kot/mealie/issues" target="_blank" rel="noreferrer nofollow">
+                  GitHub Issues
+                </a>
+                <a href="https://schema.org/Recipe" target="_blank" rel="noreferrer nofollow">
+                  Recipe Markup Specification
+                </a>
+              </div>
+              <div class="d-flex justify-end">
+                <v-btn
+                  white
+                  outlined
+                  :to="{ path: '/recipes/debugger', query: { test_url: recipeURL } }"
+                  @click="addRecipe = false"
+                >
+                  <v-icon> mdi-external-link </v-icon>
+                  View Scraped Data
+                </v-btn>
+              </div>
+            </v-alert>
+          </v-expand-transition>
         </v-card-text>
         <v-divider></v-divider>
         <v-card-actions>
-          <v-spacer></v-spacer>
           <v-btn color="grey" text @click="reset">
+            <v-icon left> mdi-close </v-icon>
             {{ $t("general.close") }}
           </v-btn>
-          <v-btn color="success" text type="submit" :loading="processing">
+          <v-spacer></v-spacer>
+          <v-btn color="success" type="submit" :loading="processing">
+            <v-icon left> {{ $globals.icons.create }} </v-icon>
             {{ $t("general.submit") }}
           </v-btn>
         </v-card-actions>
@@ -65,7 +102,6 @@
 <script>
 import { api } from "@/api";

 export default {
   props: {
     absolute: {
@@ -77,14 +113,32 @@ export default {
       error: false,
       fab: false,
       addRecipe: false,
-      recipeURL: "",
       processing: false,
     };
   },
+  mounted() {
+    if (this.$route.query.recipe_import_url) {
+      this.addRecipe = true;
+      this.createRecipe();
+    }
+  },
+  computed: {
+    recipeURL: {
+      set(recipe_import_url) {
+        this.$router.replace({ query: { ...this.$route.query, recipe_import_url } });
+      },
+      get() {
+        return this.$route.query.recipe_import_url || "";
+      },
+    },
+  },
   methods: {
     async createRecipe() {
-      if (this.$refs.urlForm.validate()) {
+      this.error = false;
+      if (this.$refs.urlForm === undefined || this.$refs.urlForm.validate()) {
         this.processing = true;
         const response = await api.recipes.createByURL(this.recipeURL);
         this.processing = false;
@@ -106,11 +160,20 @@ export default {
       this.processing = false;
     },
     isValidWebUrl(url) {
-      let regEx = /^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,256}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)$/gm;
+      let regEx =
+        /^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,256}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)$/gm;
       return regEx.test(url) ? true : "Must be a Valid URL";
     },
+    bookmark() {
+      return `javascript:(function()%7Bvar url %3D document.URL %3B%0Avar mealie %3D "http%3A%2F%2Flocalhost%3A8080%2F%23"%0Avar dest %3D mealie %2B "%2F%3Frecipe_import_url%3D" %2B url%0Awindow.open(dest%2C '_blank')%7D)()%3B`;
+    },
   },
 };
 </script>

-<style></style>
+<style>
+.force-white > a {
+  color: white !important;
+}
+</style>


@@ -6,9 +6,7 @@
       <v-list-item dense v-if="isLoggedIn" :to="`/user/${user.id}/favorites`">
         <v-list-item-icon>
-          <v-icon>
-            mdi-heart
-          </v-icon>
+          <v-icon> mdi-heart </v-icon>
         </v-list-item-icon>
         <v-list-item-content>
           <v-list-item-title> Favorites </v-list-item-title>
@@ -30,17 +28,13 @@
     <v-list nav dense class="fixedBottom" v-if="!isMain">
       <v-list-item href="https://github.com/sponsors/hay-kot" target="_target">
         <v-list-item-icon>
-          <v-icon color="pink">
-            mdi-heart
-          </v-icon>
+          <v-icon color="pink"> mdi-heart </v-icon>
         </v-list-item-icon>
         <v-list-item-title> {{ $t("about.support") }} </v-list-item-title>
       </v-list-item>
       <v-list-item to="/admin/about">
         <v-list-item-icon class="mr-3 pt-1">
-          <v-icon :color="newVersionAvailable ? 'red--text' : ''">
-            mdi-information
-          </v-icon>
+          <v-icon :color="newVersionAvailable ? 'red--text' : ''"> mdi-information </v-icon>
         </v-list-item-icon>
         <v-list-item-content>
           <v-list-item-title>
@@ -86,7 +80,8 @@ export default {
   },
   mounted() {
     this.getVersion();
-    this.resetView();
+
+    this.showSidebar = !this.isMobile;
   },
   watch: {
     user() {
@@ -98,7 +93,6 @@ export default {
     isMain() {
       const testVal = this.$route.path.split("/");
       if (testVal[1] === "recipe") this.closeSidebar();
-      else this.resetView();

       return !(testVal[1] === "admin");
     },
@@ -135,7 +129,7 @@ export default {
       const pages = this.$store.getters.getCustomPages;
       if (pages.length > 0) {
         pages.sort((a, b) => a.position - b.position);
-        return pages.map(x => ({
+        return pages.map((x) => ({
           title: x.name,
           to: `/pages/${x.slug}`,
           icon: this.$globals.icons.pages,
@@ -217,9 +211,7 @@ export default {
     resetImage() {
       this.hideImage == false;
     },
-    resetView() {
-      this.showSidebar = !this.isMobile;
-    },
     toggleSidebar() {
       this.showSidebar = !this.showSidebar;
     },


@@ -179,7 +179,8 @@
     },
     "new-recipe": {
       "bulk-add": "Bulk Add",
-      "error-message": "Looks like there was an error parsing the URL. Check the log and debug/last_recipe.json to see what went wrong.",
+      "error-title": "Looks Like We Couldn't Find Anything",
+      "error-details": "Only websites containing ld+json or microdata can be imported by Mealie. Most major recipe websites support this data structure. If your site cannot be imported but there is JSON data in the log, please submit a GitHub issue with the URL and data.",
       "from-url": "Import a Recipe",
       "paste-in-your-recipe-data-each-line-will-be-treated-as-an-item-in-a-list": "Paste in your recipe data. Each line will be treated as an item in a list",
       "recipe-url": "Recipe URL",
@@ -251,7 +252,6 @@
     "total-time": "Total Time",
     "unable-to-delete-recipe": "Unable to Delete Recipe",
     "view-recipe": "View Recipe"
   },
   "search": {
     "and": "and",


@@ -0,0 +1,62 @@
<template>
<v-container>
<v-text-field v-model="testUrl" outlined single-line label="Recipe Url"> </v-text-field>
<div class="d-flex">
<v-btn class="mt-0 ml-auto" color="info" @click="getTestData">
<v-icon left> mdi-test-tube </v-icon>
Test Scrape
</v-btn>
</div>
<VJsoneditor class="mt-2" v-model="recipeJson" height="1500px" :options="jsonEditorOptions" />
</v-container>
</template>
<script>
import VJsoneditor from "v-jsoneditor";
import { api } from "@/api";
export default {
components: {
VJsoneditor,
},
data() {
return {
jsonEditorOptions: {
mode: "code",
search: false,
mainMenuBar: false,
},
recipeJson: {},
defaultMessage: { details: "site failed to return valid schema" },
};
},
mounted() {
if (this.$route.query.test_url) {
this.getTestData();
}
},
computed: {
testUrl: {
set(test_url) {
this.$router.replace({ query: { ...this.$route.query, test_url } });
},
get() {
return this.$route.query.test_url || "";
},
},
},
methods: {
async getTestData() {
const response = await api.recipes.testScrapeURL(this.testUrl).catch(() => {
this.recipeJson = this.defaultMessage;
});
if (response.length < 1) {
this.recipeJson = this.defaultMessage;
return;
}
this.recipeJson = response;
},
},
};
</script>
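The debugger page above just renders whatever the test endpoint returns. For intuition about the ld+json markup it is probing for, here is a minimal, hypothetical sketch of locating schema.org Recipe objects in a page with `beautifulsoup4` and the standard library; this illustrates the data structure being discussed, not the actual code path Mealie or recipe-scrapers uses:

```python
import json

import requests
from bs4 import BeautifulSoup


def find_ld_json_recipes(url: str) -> list[dict]:
    """Return schema.org Recipe objects found in a page's ld+json blocks."""
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")
    recipes = []
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string or "")
        except json.JSONDecodeError:
            continue
        # ld+json may hold a single object, a list, or an @graph wrapper
        if isinstance(data, list):
            candidates = data
        elif isinstance(data, dict):
            candidates = data.get("@graph", [data])
        else:
            continue
        # "@type" can also be a list; this check is deliberately simplified
        recipes += [d for d in candidates if isinstance(d, dict) and d.get("@type") == "Recipe"]
    return recipes
```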


@@ -63,7 +63,7 @@ import RecipeViewer from "@/components/Recipe/RecipeViewer";
 import PrintView from "@/components/Recipe/PrintView";
 import RecipeEditor from "@/components/Recipe/RecipeEditor";
 import RecipeTimeCard from "@/components/Recipe/RecipeTimeCard.vue";
-import EditorButtonRow from "@/components/Recipe/EditorButtonRow";
+import EditorButtonRow from "@/components/Recipe/EditorButtonRow.vue";
 import NoRecipe from "@/components/Fallbacks/NoRecipe";
 import { user } from "@/mixins/user";
 import { router } from "@/routes";
@@ -133,7 +133,7 @@ export default {
   },
   watch: {
-    $route: function() {
+    $route: function () {
       this.getRecipeDetails();
     },
   },


@@ -1,5 +1,6 @@
 const ViewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ViewRecipe");
 const NewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/NewRecipe");
+const ScraperDebugger = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ScraperDebugger");
 const CustomPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CustomPage");
 const AllRecipes = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/AllRecipes");
 const CategoryTagPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CategoryTagPage");
@@ -9,6 +10,7 @@ import { api } from "@/api";
 export const recipeRoutes = [
   // Recipes
   { path: "/recipes/all", component: AllRecipes },
+  { path: "/recipes/debugger", component: ScraperDebugger },
   { path: "/user/:id/favorites", component: Favorites },
   { path: "/recipes/tag/:tag", component: CategoryTagPage },
   { path: "/recipes/tag", component: CategoryTagPage },


@@ -13,6 +13,7 @@ from mealie.services.events import create_recipe_event
 from mealie.services.image.image import scrape_image, write_image
 from mealie.services.recipe.media import check_assets, delete_assets
 from mealie.services.scraper.scraper import create_from_url
+from scrape_schema_recipe import scrape_url
 from slugify import slugify
 from sqlalchemy.orm.session import Session
@@ -41,6 +42,11 @@
     return recipe.slug


+@router.post("/test-scrape-url", dependencies=[Depends(get_current_user)])
+def test_parse_recipe_url(url: RecipeURLIn):
+    return scrape_url(url.url)
+
+
 @router.post("/create-url", status_code=201, response_model=str)
 def parse_recipe_url(
     background_tasks: BackgroundTasks,
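With the route above registered, the debugger page drives it through `recipeAPI.testScrapeURL`. It can also be poked by hand with an authenticated POST; this is a minimal sketch assuming the recipe router's `/api/recipes` prefix and Mealie's bearer-token auth, with a placeholder token:

```python
import requests

BASE_URL = "http://localhost:8080"  # assumed local instance
TOKEN = "<your-api-token>"  # any active user's bearer token

response = requests.post(
    f"{BASE_URL}/api/recipes/test-scrape-url",
    json={"url": "https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza"},
    headers={"Authorization": f"Bearer {TOKEN}"},
)
response.raise_for_status()
# scrape_url returns the raw schema data found on the page, before cleaning
print(response.json())
```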


@@ -1,15 +1,14 @@
 from pathlib import Path
 from typing import Optional

-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException, status
 from mealie.routes.deps import validate_file_token
 from starlette.responses import FileResponse
-from fastapi import HTTPException, status

 router = APIRouter(prefix="/api/utils", tags=["Utils"], include_in_schema=True)


-@router.get("/download/{token}")
+@router.get("/download")
 async def download_file(file_path: Optional[Path] = Depends(validate_file_token)):
     """Uses a file token obtained by an active user to retrieve a file from the operating
     system."""


@@ -42,6 +42,7 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:

 def scrape_image(image_url: str, slug: str) -> Path:
+    logger.info(f"Image URL: {image_url}")
     if isinstance(image_url, str):  # Handles String Types
         image_url = image_url

@@ -64,7 +65,7 @@ def scrape_image(image_url: str, slug: str) -> Path:
         if r.status_code == 200:
             r.raw.decode_content = True
-
+            logger.info(f"File Name Suffix {filename.suffix}")
             write_image(slug, r.raw, filename.suffix)
             filename.unlink(missing_ok=True)


@@ -39,6 +39,8 @@ def minify_image(image_file: Path, force=False) -> ImageSizes:
     min_dest = image_file.parent.joinpath("min-original.webp")
     tiny_dest = image_file.parent.joinpath("tiny-original.webp")

+    cleanup_images = False
+
     if min_dest.exists() and tiny_dest.exists() and org_dest.exists() and not force:
         return
     try:


@@ -9,7 +9,7 @@ from mealie.db.database import db
 from mealie.schema.migration import MigrationImport
 from mealie.schema.recipe import Recipe
 from mealie.services.image import image
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.utils.unzip import unpack_zip
 from pydantic import BaseModel

@@ -144,7 +144,7 @@ class MigrationBase(BaseModel):
         """Calls the rewrite_alias function and the Cleaner.clean function on a
         dictionary and returns the result unpacked into a Recipe object"""
         recipe_dict = self.rewrite_alias(recipe_dict)
-        recipe_dict = Cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
+        recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))

         return Recipe(**recipe_dict)


@@ -1,4 +1,5 @@
 import html
+import json
 import re
 from datetime import datetime, timedelta
 from typing import List
@@ -6,157 +7,157 @@ from typing import List
 from slugify import slugify


-class Cleaner:
-    """A Namespace for utility functions to clean recipe data extracted
-    from a url and return a dictionary that is ready for import into
-    the database. Cleaner.clean is the main entrypoint
-    """
-
-    @staticmethod
-    def clean(recipe_data: dict, url=None) -> dict:
-        """Main entrypoint to clean a recipe extracted from the web
-        and format the data into an acceptable format for the database
-
-        Args:
-            recipe_data (dict): raw recipe dictionary
-
-        Returns:
-            dict: cleaned recipe dictionary
-        """
-        recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
-
-        # Times
-        recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
-        recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
-        recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
-        recipe_data["recipeCategory"] = Cleaner.category(recipe_data.get("recipeCategory", []))
-
-        recipe_data["recipeYield"] = Cleaner.yield_amount(recipe_data.get("recipeYield"))
-        recipe_data["recipeIngredient"] = Cleaner.ingredient(recipe_data.get("recipeIngredient"))
-        recipe_data["recipeInstructions"] = Cleaner.instructions(recipe_data.get("recipeInstructions"))
-        recipe_data["image"] = Cleaner.image(recipe_data.get("image"))
-        recipe_data["slug"] = slugify(recipe_data.get("name"))
-        recipe_data["orgURL"] = url
-
-        return recipe_data
-
-    @staticmethod
-    def category(category: str):
-        if isinstance(category, str) and category != "":
-            return [category]
-        else:
-            return []
-
-    @staticmethod
-    def html(raw_html):
-        cleanr = re.compile("<.*?>")
-        return re.sub(cleanr, "", raw_html)
-
-    @staticmethod
-    def image(image=None) -> str:
-        if not image:
-            return "no image"
-        if isinstance(image, list):
-            return image[0]
-        elif isinstance(image, dict):
-            return image["url"]
-        elif isinstance(image, str):
-            return image
-        else:
-            raise Exception(f"Unrecognised image URL format: {image}")
-
-    @staticmethod
-    def instructions(instructions) -> List[dict]:
-        if not instructions:
-            return []
-
-        if isinstance(instructions[0], list):
-            instructions = instructions[0]
-
-        # One long string split by (possibly multiple) new lines
-        if isinstance(instructions, str):
-            return [{"text": Cleaner._instruction(line)} for line in instructions.splitlines() if line]
-
-        # Plain strings in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], str):
-            return [{"text": Cleaner._instruction(step)} for step in instructions]
-
-        # Dictionaries (let's assume it's a HowToStep) in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], dict):
-            # Try List of Dictionary without "@type" or "type"
-            if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
-                return [{"text": Cleaner._instruction(step["text"])} for step in instructions]
-
-            try:
-                # If HowToStep is under HowToSection
-                sectionSteps = []
-                for step in instructions:
-                    if step["@type"] == "HowToSection":
-                        [sectionSteps.append(item) for item in step["itemListElement"]]
-
-                if len(sectionSteps) > 0:
-                    return [
-                        {"text": Cleaner._instruction(step["text"])}
-                        for step in sectionSteps
-                        if step["@type"] == "HowToStep"
-                    ]
-
-                return [
-                    {"text": Cleaner._instruction(step["text"])}
-                    for step in instructions
-                    if step["@type"] == "HowToStep"
-                ]
-            except Exception as e:
-                print(e)
-                # Not "@type", try "type"
-                try:
-                    return [
-                        {"text": Cleaner._instruction(step["properties"]["text"])}
-                        for step in instructions
-                        if step["type"].find("HowToStep") > -1
-                    ]
-                except Exception:
-                    pass
-
-        else:
-            raise Exception(f"Unrecognised instruction format: {instructions}")
-
-    @staticmethod
-    def _instruction(line) -> str:
-        clean_line = Cleaner.html(line.strip())
-        # Some sites erroneously escape their strings on multiple levels
-        while not clean_line == (clean_line := html.unescape(clean_line)):
-            pass
-        return clean_line
-
-    @staticmethod
-    def ingredient(ingredients: list) -> str:
-        if ingredients:
-            return [Cleaner.html(html.unescape(ing)) for ing in ingredients]
-        else:
-            return []
-
-    @staticmethod
-    def yield_amount(yld) -> str:
-        if isinstance(yld, list):
-            return yld[-1]
-        else:
-            return yld
-
-    @staticmethod
-    def time(time_entry):
-        if time_entry is None:
-            return None
-        elif isinstance(time_entry, timedelta):
-            pretty_print_timedelta(time_entry)
-        elif isinstance(time_entry, datetime):
-            print(time_entry)
-        elif isinstance(time_entry, str):
-            if re.match("PT.*H.*M", time_entry):
-                time_delta_object = parse_duration(time_entry)
-                return pretty_print_timedelta(time_delta_object)
-        else:
-            return str(time_entry)
+def clean(recipe_data: dict, url=None) -> dict:
+    """Main entrypoint to clean a recipe extracted from the web
+    and format the data into an acceptable format for the database
+
+    Args:
+        recipe_data (dict): raw recipe dictionary
+
+    Returns:
+        dict: cleaned recipe dictionary
+    """
+    recipe_data["description"] = clean_string(recipe_data.get("description", ""))
+
+    # Times
+    recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
+    recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
+    recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
+    recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))
+
+    recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
+    recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
+    recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
+    recipe_data["image"] = image(recipe_data.get("image"))
+    recipe_data["slug"] = slugify(recipe_data.get("name"))
+    recipe_data["orgURL"] = url
+
+    return recipe_data
+
+
+def clean_string(text: str) -> str:
+    cleaned_text = html.unescape(text)
+    cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
+    cleaned_text = re.sub(" +", " ", cleaned_text)
+    cleaned_text = re.sub("</p>", "\n", cleaned_text)
+    cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
+    cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
+    return cleaned_text
+
+
+def category(category: str):
+    if isinstance(category, str) and category != "":
+        return [category]
+    else:
+        return []
+
+
+def clean_html(raw_html):
+    cleanr = re.compile("<.*?>")
+    return re.sub(cleanr, "", raw_html)
+
+
+def image(image=None) -> str:
+    if not image:
+        return "no image"
+    if isinstance(image, list):
+        return image[0]
+    elif isinstance(image, dict):
+        return image["url"]
+    elif isinstance(image, str):
+        return image
+    else:
+        raise Exception(f"Unrecognised image URL format: {image}")
+
+
+def instructions(instructions) -> List[dict]:
+    try:
+        instructions = json.loads(instructions)
+    except Exception:
+        pass
+
+    if not instructions:
+        return []
+
+    if isinstance(instructions, list) and isinstance(instructions[0], list):
+        instructions = instructions[0]
+
+    # One long string split by (possibly multiple) new lines
+    if isinstance(instructions, str):
+        return [{"text": _instruction(line)} for line in instructions.splitlines() if line]
+
+    # Plain strings in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], str):
+        return [{"text": _instruction(step)} for step in instructions]
+
+    # Dictionaries (let's assume it's a HowToStep) in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], dict):
+        # Try List of Dictionary without "@type" or "type"
+        if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
+            return [{"text": _instruction(step["text"])} for step in instructions]
+
+        try:
+            # If HowToStep is under HowToSection
+            sectionSteps = []
+            for step in instructions:
+                if step["@type"] == "HowToSection":
+                    [sectionSteps.append(item) for item in step["itemListElement"]]
+
+            if len(sectionSteps) > 0:
+                return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]
+
+            return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
+        except Exception as e:
+            print(e)
+            # Not "@type", try "type"
+            try:
+                return [
+                    {"text": _instruction(step["properties"]["text"])}
+                    for step in instructions
+                    if step["type"].find("HowToStep") > -1
+                ]
+            except Exception:
+                pass
+
+    else:
+        raise Exception(f"Unrecognised instruction format: {instructions}")
+
+
+def _instruction(line) -> str:
+    clean_line = clean_string(line.strip())
+    # Some sites erroneously escape their strings on multiple levels
+    while not clean_line == (clean_line := clean_string(clean_line)):
+        pass
+    return clean_line
+
+
+def ingredient(ingredients: list) -> str:
+    if ingredients:
+        return [clean_string(ing) for ing in ingredients]
+    else:
+        return []
+
+
+def yield_amount(yld) -> str:
+    if isinstance(yld, list):
+        return yld[-1]
+    else:
+        return yld
+
+
+def clean_time(time_entry):
+    if time_entry is None:
+        return None
+    elif isinstance(time_entry, timedelta):
+        pretty_print_timedelta(time_entry)
+    elif isinstance(time_entry, datetime):
+        print(time_entry)
+    elif isinstance(time_entry, str):
+        if re.match("PT.*H.*M", time_entry):
+            time_delta_object = parse_duration(time_entry)
+            return pretty_print_timedelta(time_delta_object)
+    else:
+        return str(time_entry)


 # ! TODO: Cleanup Code Below
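Since `clean` is now a plain module function, the migrations and the scraper can share it without reaching into a class namespace. A minimal sketch of the round trip it performs, run inside the Mealie codebase with a hypothetical schema.org-shaped payload:

```python
from mealie.services.scraper import cleaner

# Hypothetical raw payload, keys mirroring the schema.org names handled above
raw = {
    "name": "Weeknight Pasta Bake",
    "description": "<p>Easy &amp; cheesy.</p>",
    "recipeYield": ["4", "4 servings"],
    "recipeIngredient": ["1 lb pasta", "2 cups cheddar"],
    "recipeInstructions": "Boil pasta.\nBake with cheese.",
}

clean_data = cleaner.clean(raw, url="https://example.com/pasta-bake")

assert clean_data["slug"] == "weeknight-pasta-bake"
assert clean_data["description"] == "Easy & cheesy."
assert clean_data["recipeYield"] == "4 servings"  # last entry of the list wins
assert clean_data["recipeInstructions"][0] == {"text": "Boil pasta."}
```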


@@ -1,18 +1,20 @@
 import json
-from typing import List
+from enum import Enum
+from typing import Any, Callable

 import requests
-import scrape_schema_recipe
-from mealie.core import root_logger
+from fastapi import HTTPException, status
 from mealie.core.config import app_dirs
-from mealie.schema.recipe import Recipe
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import Recipe, RecipeStep
 from mealie.services.image.image import scrape_image
-from mealie.services.scraper import open_graph
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner, open_graph
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me

 LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")
-logger = root_logger.get_logger()
+
+logger = get_logger()


 def create_from_url(url: str) -> Recipe:
@@ -25,52 +27,130 @@ def create_from_url(url: str) -> Recipe:
     Returns:
         Recipe: Recipe Object
     """
-    r = requests.get(url)
-    new_recipe = extract_recipe_from_html(r.text, url)
-    new_recipe = Cleaner.clean(new_recipe, url)
-    new_recipe = download_image_for_recipe(new_recipe)
+    new_recipe = scrape_from_url(url)
+    logger.info(f"Image {new_recipe.image}")
+    new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)

-    return Recipe(**new_recipe)
+    return new_recipe


-def extract_recipe_from_html(html: str, url: str) -> dict:
-    try:
-        scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
-        dump_last_json(scraped_recipes)
-
-        if not scraped_recipes:
-            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url, python_objects=True)
-    except Exception as e:
-        print(e)
-        scraped_recipes: List[dict] = scrape_schema_recipe.loads(html)
-        dump_last_json(scraped_recipes)
-
-        if not scraped_recipes:
-            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url)
-
-    if scraped_recipes:
-        new_recipe: dict = scraped_recipes[0]
-        logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
-        if not new_recipe:
-            return "fail"  # TODO: Return Better Error Here
-
-        new_recipe = Cleaner.clean(new_recipe, url)
-    else:
-        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
-        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
-
-    return new_recipe
-
-
-def download_image_for_recipe(recipe: dict) -> dict:
-    try:
-        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
-        recipe["image"] = img_path.name
-    except Exception:
-        recipe["image"] = "no image"
-
-    return recipe
+class ParserErrors(str, Enum):
+    bad_recipe = "BAD_RECIPE_DATA"
+    no_recipe_data = "NO_RECIPE_DATA"
+    connection_error = "CONNECTION_ERROR"
+
+
+def extract_open_graph_values(url) -> Recipe:
+    r = requests.get(url)
+    recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
+
+    return Recipe(**recipe)
+
+
+def scrape_from_url(url: str) -> Recipe:
+    """Entry function for generating a recipe object from a URL.
+    This will determine if a URL can be parsed and raise an appropriate error keyword.
+    This keyword is used on the frontend to reference a localized string to present on the UI.
+
+    Args:
+        url (str): String Representing the URL
+
+    Raises:
+        HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
+
+    Returns:
+        Recipe: Recipe Model
+    """
+    try:
+        scraped_schema = scrape_me(url)
+    except (WebsiteNotImplementedError, AttributeError):
+        try:
+            scraped_schema = scrape_me(url, wild_mode=True)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            recipe = extract_open_graph_values(url)
+            if recipe.name != "":
+                return recipe
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.bad_recipe.value})
+    except ConnectionError:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.connection_error.value})
+
+    try:
+        instruct = scraped_schema.instructions()
+    except Exception:
+        instruct = []
+
+    try:
+        ing = scraped_schema.ingredients()
+    except Exception:
+        ing = []
+
+    if not instruct and not ing:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.no_recipe_data.value})
+    else:
+        return clean_scraper(scraped_schema, url)
+
+
+def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
+    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
+        value = default
+        try:
+            value = func_call()
+        except Exception:
+            logger.error(f"Error parsing recipe func_call for '{get_attr}'")
+
+        if value == default:
+            try:
+                value = scraped_data.schema.data.get(get_attr)
+            except Exception:
+                logger.error(f"Error parsing recipe attribute '{get_attr}'")
+
+        if clean_func:
+            value = clean_func(value)
+
+        return value
+
+    def get_instructions() -> list[dict]:
+        instruction_as_text = try_get_default(
+            scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
+        )
+
+        logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        instruction_as_text = cleaner.instructions(instruction_as_text)
+
+        logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        try:
+            return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
+        except TypeError:
+            return []
+
+    return Recipe(
+        name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
+        slug="",
+        image=try_get_default(scraped_data.image, "image", None),
+        description=try_get_default(None, "description", "", cleaner.clean_string),
+        recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
+        recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
+        recipe_instructions=get_instructions(),
+        total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
+        prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
+        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+        org_url=url,
+    )
+
+
+def download_image_for_recipe(slug, image_url) -> str:
+    img_name = None
+    try:
+        img_path = scrape_image(image_url, slug)
+        img_name = img_path.name
+    except Exception as e:
+        logger.error(f"Error Scraping Image: {e}")
+        img_name = None
+
+    return img_name or "no image"


 def dump_last_json(recipe_data: dict):
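The `ParserErrors` keywords travel to the frontend inside the `HTTPException` detail, which is how the new dialog and debugger decide what to display. A minimal sketch of consuming them on the Python side (again assuming the Mealie environment, since it imports the module above):

```python
from fastapi import HTTPException

from mealie.services.scraper import scraper


def try_scrape(url: str):
    try:
        return scraper.create_from_url(url)
    except HTTPException as e:
        # e.detail carries a ParserErrors keyword, e.g. {"details": "NO_RECIPE_DATA"},
        # which the UI maps to a localized error message
        print(f"Scrape failed for {url}: {e.detail.get('details')}")
        return None
```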

poetry.lock (generated): 19 lines changed

@@ -939,6 +939,19 @@ python-versions = "*"
 [package.dependencies]
 rdflib = ">=4.2.2"

+[[package]]
+name = "recipe-scrapers"
+version = "13.2.7"
+description = "Python package, scraping recipes from all over the internet"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+beautifulsoup4 = ">=4.6.0"
+extruct = ">=0.8.0"
+requests = ">=2.19.1"
+
 [[package]]
 name = "regex"
 version = "2021.4.4"
@@ -1236,7 +1249,7 @@ python-versions = "*"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "73bac73c62e64c90a29816dde9ef1d896e8ca0b4271e67cde6ca8cc56bd87efd"
+content-hash = "8a123b6b0cf37c1d4a66ea4f137f79bba79f373c7019af879e1b06fb5ded0ed4"

 [metadata.files]
 aiofiles = [
@@ -1893,6 +1906,10 @@ rdflib = [
 rdflib-jsonld = [
     {file = "rdflib-jsonld-0.5.0.tar.gz", hash = "sha256:4f7d55326405071c7bce9acf5484643bcb984eadb84a6503053367da207105ed"},
 ]
+recipe-scrapers = [
+    {file = "recipe_scrapers-13.2.7-py3-none-any.whl", hash = "sha256:e5b2a251bbba2ef319ce32a10c4073b23f483f0ee2db83da543204549b06dffe"},
+    {file = "recipe_scrapers-13.2.7.tar.gz", hash = "sha256:e03d20a5c39f9c3dcb0185be1b6480ac0a086900d6aacf1699c77fa090944901"},
+]
 regex = [
     {file = "regex-2021.4.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:619d71c59a78b84d7f18891fe914446d07edd48dc8328c8e149cbe0929b4e000"},
     {file = "regex-2021.4.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:47bf5bf60cf04d72bf6055ae5927a0bd9016096bf3d742fa50d9bf9f45aa0711"},


@@ -33,6 +33,7 @@ lxml = "4.6.2"
 Pillow = "^8.2.0"
 pathvalidate = "^2.4.1"
 apprise = "^0.9.2"
+recipe-scrapers = "^13.2.7"

 [tool.poetry.dev-dependencies]


@@ -2,8 +2,8 @@ import json
 import re

 import pytest
-from mealie.services.scraper.cleaner import Cleaner
-from mealie.services.scraper.scraper import extract_recipe_from_html
+from mealie.services.scraper import cleaner
+from mealie.services.scraper.scraper import open_graph
 from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES

 # https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
@@ -39,23 +39,23 @@ url_validation_regex = re.compile(
     ],
 )
 def test_cleaner_clean(json_file, num_steps):
-    recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+    recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
     assert len(recipe_data["recipeInstructions"]) == num_steps


 def test_clean_category():
-    assert Cleaner.category("my-category") == ["my-category"]
+    assert cleaner.category("my-category") == ["my-category"]


-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"


 def test_clean_image():
-    assert Cleaner.image(None) == "no image"
-    assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
-    assert Cleaner.image({"url": "My URL!"}) == "My URL!"
-    assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+    assert cleaner.image(None) == "no image"
+    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+    assert cleaner.image({"url": "My URL!"}) == "My URL!"
+    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"


 @pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
     ],
 )
 def test_cleaner_instructions(instructions):
-    assert Cleaner.instructions(instructions) == [
+    assert cleaner.instructions(instructions) == [
         {"text": "A"},
         {"text": "B"},
         {"text": "C"},
@@ -80,20 +80,18 @@ def test_cleaner_instructions(instructions):
 def test_html_with_recipe_data():
     path = TEST_RAW_HTML.joinpath("healthy_pasta_bake_60759.html")
     url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
-    recipe_data = extract_recipe_from_html(open(path, encoding="utf8").read(), url)
+    recipe_data = open_graph.basic_recipe_from_opengraph(open(path, encoding="utf8").read(), url)

     assert len(recipe_data["name"]) > 10
     assert len(recipe_data["slug"]) > 10
     assert recipe_data["orgURL"] == url
     assert len(recipe_data["description"]) > 100
     assert url_validation_regex.match(recipe_data["image"])
-    assert len(recipe_data["recipeIngredient"]) == 13
-    assert len(recipe_data["recipeInstructions"]) == 4


 def test_time_cleaner():
     my_time_delta = "PT2H30M"
-    return_delta = Cleaner.time(my_time_delta)
+    return_delta = cleaner.clean_time(my_time_delta)

     assert return_delta == "2 Hours 30 Minutes"


@@ -0,0 +1,62 @@
from dataclasses import dataclass
import pytest
from mealie.services.scraper import scraper
@dataclass
class RecipeSiteTestCase:
url: str
expected_slug: str
num_ingredients: int
num_steps: int
test_cases = [
RecipeSiteTestCase(
url="https://www.seriouseats.com/taiwanese-three-cup-chicken-san-bei-gi-recipe",
expected_slug="taiwanese-three-cup-chicken-san-bei-ji-recipe",
num_ingredients=10,
num_steps=3,
),
RecipeSiteTestCase(
url="https://www.rezeptwelt.de/backen-herzhaft-rezepte/schinken-kaese-waffeln-ohne-viel-schnickschnack/4j0bkiig-94d4d-106529-cfcd2-is97x2ml",
expected_slug="schinken-kase-waffeln-ohne-viel-schnickschnack",
num_ingredients=7,
num_steps=1, # Malformed JSON Data, can't parse steps just get one string
),
RecipeSiteTestCase(
url="https://cookpad.com/us/recipes/5544853-sous-vide-smoked-beef-ribs",
expected_slug="sous-vide-smoked-beef-ribs",
num_ingredients=7,
num_steps=12,
),
RecipeSiteTestCase(
url="https://www.greatbritishchefs.com/recipes/jam-roly-poly-recipe",
expected_slug="jam-roly-poly-with-custard",
num_ingredients=13,
num_steps=9,
),
RecipeSiteTestCase(
url="https://recipes.anovaculinary.com/recipe/sous-vide-shrimp",
expected_slug="sous-vide-shrimp",
num_ingredients=5,
num_steps=0,
),
RecipeSiteTestCase(
url="https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza",
expected_slug="detroit-style-pepperoni-pizza",
num_ingredients=8,
num_steps=5,
),
]
@pytest.mark.parametrize("recipe_test_data", test_cases)
def test_recipe_parser(recipe_test_data: RecipeSiteTestCase):
recipe = scraper.create_from_url(recipe_test_data.url)
assert recipe.slug == recipe_test_data.expected_slug
assert len(recipe.recipe_instructions) == recipe_test_data.num_steps
assert len(recipe.recipe_ingredient) == recipe_test_data.num_ingredients
assert recipe.org_url == recipe_test_data.url