From ba59ac679db276baf4ed0f7e462633529fb791e3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Mar 2019 10:17:27 +0530 Subject: [PATCH] Fix incorrect soup usage in various recipes Also make SoupStrainer available in calibre.ebooks.BeautifulSoup --- recipes/calcalist.recipe | 2 +- recipes/globes_co_il.recipe | 2 +- recipes/metro_news_nl.recipe | 3 +-- recipes/roger_ebert.recipe | 7 +++---- recipes/roger_ebert_blog.recipe | 4 ++-- src/calibre/ebooks/BeautifulSoup.py | 4 ++-- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/recipes/calcalist.recipe b/recipes/calcalist.recipe index 699d0e911b..271633fc73 100644 --- a/recipes/calcalist.recipe +++ b/recipes/calcalist.recipe @@ -1,5 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import re +import re class AdvancedUserRecipe1283848012(BasicNewsRecipe): diff --git a/recipes/globes_co_il.recipe b/recipes/globes_co_il.recipe index 2634dd50dd..48b7db4fed 100644 --- a/recipes/globes_co_il.recipe +++ b/recipes/globes_co_il.recipe @@ -1,5 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import re +import re class AdvancedUserRecipe1283848012(BasicNewsRecipe): diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe index fa5c4fdd7a..8fb9351234 100644 --- a/recipes/metro_news_nl.recipe +++ b/recipes/metro_news_nl.recipe @@ -3,7 +3,6 @@ from __future__ import print_function from calibre.web.feeds.news import BasicNewsRecipe import re from calibre.utils.magick import Image -from calibre.ebooks.BeautifulSoup import BeautifulSoup ''' Version 1.2, updated cover image to match the changed website. added info date on title @@ -163,7 +162,7 @@ class MerryExtract(): return killingSoup -class MerryProcess(BeautifulSoup): +class MerryProcess(object): myKiller = MerryExtract() myPrepare = MerryPreProcess() diff --git a/recipes/roger_ebert.recipe b/recipes/roger_ebert.recipe index a3e68c3466..6fd357caea 100644 --- a/recipes/roger_ebert.recipe +++ b/recipes/roger_ebert.recipe @@ -1,7 +1,6 @@ import re import urllib2 from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer class Ebert(BasicNewsRecipe): @@ -78,8 +77,8 @@ class Ebert(BasicNewsRecipe): description = match.group(2) self.log(thislink) - - for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')): + soup = self.index_to_soup(thislink) + for link in soup.findAll('a', href=True): thisurl = self.PREFIX + link['href'] thislinktext = self.tag_to_string(link) @@ -91,7 +90,7 @@ class Ebert(BasicNewsRecipe): if thistitle == '': thistitle = 'Ebert Journal Post' - """ + r""" pattern2 = r'AID=\/(.*?)\/' reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL) match2 = reg2.search(thisurl) diff --git a/recipes/roger_ebert_blog.recipe b/recipes/roger_ebert_blog.recipe index 8679180e8a..3a50f91e77 100644 --- a/recipes/roger_ebert_blog.recipe +++ b/recipes/roger_ebert_blog.recipe @@ -2,7 +2,6 @@ import re import urllib2 import time from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer from calibre import strftime ''' @@ -94,8 +93,9 @@ class Ebert(BasicNewsRecipe): description = match.group(2) self.log(thislink) + soup = self.index_to_soup(thislink) - for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')): + for link in soup.findAll('a', href=True): thisurl = self.PREFIX + link['href'] thislinktext = self.tag_to_string(link) diff --git a/src/calibre/ebooks/BeautifulSoup.py b/src/calibre/ebooks/BeautifulSoup.py index 24e2dd4526..76d868379c 100644 --- a/src/calibre/ebooks/BeautifulSoup.py +++ b/src/calibre/ebooks/BeautifulSoup.py @@ -6,8 +6,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera import bs4 from bs4 import ( # noqa - CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag, - __version__ + CData, Comment, Declaration, NavigableString, ProcessingInstruction, + SoupStrainer, Tag, __version__ ) from polyglot.builtins import unicode_type