Fix #1012903 (Updated recipe for The Christian Science Monitor)

This commit is contained in:
Kovid Goyal 2012-06-14 09:07:21 +05:30
parent 219de5c4ea
commit 462945fd39

View File

@@ -4,6 +4,7 @@ __copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
www.csmonitor.com www.csmonitor.com
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CSMonitor(BasicNewsRecipe): class CSMonitor(BasicNewsRecipe):
@@ -40,13 +41,15 @@ class CSMonitor(BasicNewsRecipe):
remove_tags = [ remove_tags = [
dict(name=['meta','link','iframe','object','embed']) dict(name=['meta','link','iframe','object','embed'])
,dict(attrs={'class':['podStoryRel','bottom-rel','hide']}) ,dict(attrs={'class':re.compile('(^|| )podStoryRel($|| )', re.DOTALL)})
,dict(attrs={'class':['bottom-rel','hide']})
,dict(attrs={'id':['pgallerycarousel_enlarge','pgallerycarousel_related']}) ,dict(attrs={'id':['pgallerycarousel_enlarge','pgallerycarousel_related']})
] ]
keep_only_tags = [ keep_only_tags = [
dict(name='h1', attrs={'class':'head'}) dict(name='h1', attrs={'class':'head'})
,dict(name='h2', attrs={'class':'subhead'}) ,dict(name='h2', attrs={'class':'subhead'})
,dict(attrs={'class':['sByline','podStoryGal','ui-body-header','sBody']}) ,dict(attrs={'class':['sByline','thePhoto','ui-body-header']})
,dict(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)})
] ]
remove_attributes=['xmlns:fb'] remove_attributes=['xmlns:fb']
@@ -74,10 +77,10 @@ class CSMonitor(BasicNewsRecipe):
if nexttag: if nexttag:
nurl = 'http://www.csmonitor.com' + nexttag['href'] nurl = 'http://www.csmonitor.com' + nexttag['href']
soup2 = self.index_to_soup(nurl) soup2 = self.index_to_soup(nurl)
texttag = soup2.find(attrs={'class':'sBody'}) texttag = soup2.find(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)})
if texttag: if texttag:
appendtag = soup.find(attrs={'class':'sBody'}) appendtag = soup.find(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)})
for citem in texttag.findAll(attrs={'class':['podStoryRel','bottom-rel','hide']}): for citem in texttag.findAll(attrs={'class':[re.compile('(^|| )podStoryRel($|| )', re.DOTALL),'bottom-rel','hide']}):
citem.extract() citem.extract()
self.append_page(soup2) self.append_page(soup2)
texttag.extract() texttag.extract()