Fix incorrect BeautifulSoup usage in various recipes

Also make SoupStrainer importable from calibre.ebooks.BeautifulSoup.
This commit is contained in:
Kovid Goyal 2019-03-25 10:17:27 +05:30
parent de9d97d688
commit ba59ac679d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 10 additions and 12 deletions

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import re import re
class AdvancedUserRecipe1283848012(BasicNewsRecipe): class AdvancedUserRecipe1283848012(BasicNewsRecipe):

View File

@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import re import re
class AdvancedUserRecipe1283848012(BasicNewsRecipe): class AdvancedUserRecipe1283848012(BasicNewsRecipe):

View File

@ -3,7 +3,6 @@ from __future__ import print_function
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re import re
from calibre.utils.magick import Image from calibre.utils.magick import Image
from calibre.ebooks.BeautifulSoup import BeautifulSoup
''' Version 1.2, updated cover image to match the changed website. ''' Version 1.2, updated cover image to match the changed website.
added info date on title added info date on title
@ -163,7 +162,7 @@ class MerryExtract():
return killingSoup return killingSoup
class MerryProcess(BeautifulSoup): class MerryProcess(object):
myKiller = MerryExtract() myKiller = MerryExtract()
myPrepare = MerryPreProcess() myPrepare = MerryPreProcess()

View File

@ -1,7 +1,6 @@
import re import re
import urllib2 import urllib2
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
class Ebert(BasicNewsRecipe): class Ebert(BasicNewsRecipe):
@ -78,8 +77,8 @@ class Ebert(BasicNewsRecipe):
description = match.group(2) description = match.group(2)
self.log(thislink) self.log(thislink)
soup = self.index_to_soup(thislink)
for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')): for link in soup.findAll('a', href=True):
thisurl = self.PREFIX + link['href'] thisurl = self.PREFIX + link['href']
thislinktext = self.tag_to_string(link) thislinktext = self.tag_to_string(link)
@ -91,7 +90,7 @@ class Ebert(BasicNewsRecipe):
if thistitle == '': if thistitle == '':
thistitle = 'Ebert Journal Post' thistitle = 'Ebert Journal Post'
""" r"""
pattern2 = r'AID=\/(.*?)\/' pattern2 = r'AID=\/(.*?)\/'
reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL) reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
match2 = reg2.search(thisurl) match2 = reg2.search(thisurl)

View File

@ -2,7 +2,6 @@ import re
import urllib2 import urllib2
import time import time
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
from calibre import strftime from calibre import strftime
''' '''
@ -94,8 +93,9 @@ class Ebert(BasicNewsRecipe):
description = match.group(2) description = match.group(2)
self.log(thislink) self.log(thislink)
soup = self.index_to_soup(thislink)
for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')): for link in soup.findAll('a', href=True):
thisurl = self.PREFIX + link['href'] thisurl = self.PREFIX + link['href']
thislinktext = self.tag_to_string(link) thislinktext = self.tag_to_string(link)

View File

@ -6,8 +6,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import bs4 import bs4
from bs4 import ( # noqa from bs4 import ( # noqa
CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag, CData, Comment, Declaration, NavigableString, ProcessingInstruction,
__version__ SoupStrainer, Tag, __version__
) )
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type