From 462945fd395e678795b12edce217916cdb0831b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 14 Jun 2012 09:07:21 +0530 Subject: [PATCH] Fix #1012903 (Updated recipe for The Christian Science Monitor) --- recipes/chr_mon.recipe | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index f2a11cc067..dd8b3e0b33 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -4,6 +4,7 @@ __copyright__ = '2012, Darko Miletic ' www.csmonitor.com ''' +import re from calibre.web.feeds.news import BasicNewsRecipe class CSMonitor(BasicNewsRecipe): @@ -40,13 +41,15 @@ class CSMonitor(BasicNewsRecipe): remove_tags = [ dict(name=['meta','link','iframe','object','embed']) - ,dict(attrs={'class':['podStoryRel','bottom-rel','hide']}) + ,dict(attrs={'class':re.compile('(^|| )podStoryRel($|| )', re.DOTALL)}) + ,dict(attrs={'class':['bottom-rel','hide']}) ,dict(attrs={'id':['pgallerycarousel_enlarge','pgallerycarousel_related']}) ] keep_only_tags = [ dict(name='h1', attrs={'class':'head'}) ,dict(name='h2', attrs={'class':'subhead'}) - ,dict(attrs={'class':['sByline','podStoryGal','ui-body-header','sBody']}) + ,dict(attrs={'class':['sByline','thePhoto','ui-body-header']}) + ,dict(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)}) ] remove_attributes=['xmlns:fb'] @@ -74,10 +77,10 @@ class CSMonitor(BasicNewsRecipe): if nexttag: nurl = 'http://www.csmonitor.com' + nexttag['href'] soup2 = self.index_to_soup(nurl) - texttag = soup2.find(attrs={'class':'sBody'}) + texttag = soup2.find(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)}) if texttag: - appendtag = soup.find(attrs={'class':'sBody'}) - for citem in texttag.findAll(attrs={'class':['podStoryRel','bottom-rel','hide']}): + appendtag = soup.find(attrs={'class':re.compile('(^|| )sBody($|| )', re.DOTALL)}) + for citem in texttag.findAll(attrs={'class':[re.compile('(^|| )podStoryRel($|| )', re.DOTALL),'bottom-rel','hide']}): citem.extract() self.append_page(soup2) texttag.extract()