__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic '
'''
www.csmonitor.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe

# Match a CSS class as a whole whitespace-delimited token of the class
# attribute.  An empty alternative inside the groups (``(^|| )``) would make
# the pattern succeed on any substring, so e.g. ``sBodyExtra`` would match too.
SBODY_CLASS = re.compile(r'(^|\s)sBody($|\s)')
RELATED_CLASS = re.compile(r'(^|\s)podStoryRel($|\s)')


class CSMonitor(BasicNewsRecipe):
    """Recipe that builds a daily edition of The Christian Science Monitor.

    Articles are collected from the RSS feeds listed in ``feeds``.  Articles
    split over several web pages are stitched together by ``append_page``
    and then cleaned up in ``preprocess_html``.
    """

    title = 'The Christian Science Monitor - daily'
    __author__ = 'Darko Miletic'
    description = (
        'The Christian Science Monitor is an international news organization '
        'that delivers thoughtful, global coverage via its website, weekly '
        'magazine, daily news briefing, and email newsletters.'
    )
    publisher = 'The Christian Science Monitor'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
    extra_css = (
        ' body{font-family: Arial,Tahoma,Verdana,Helvetica,sans-serif }'
        ' img{margin-bottom: 0.4em; display:block}'
        ' .head {font-family: Georgia,"Times New Roman",Times,serif}'
        ' .sByline,.caption{font-size: x-small}'
        ' .hide{display: none}'
        ' .sLoc{font-weight: bold}'
        ' ul{list-style-type: none} '
    )

    # Metadata handed to the conversion step; mirrors the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Page furniture dropped from every article.
    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'object', 'embed']),
        dict(attrs={'class': RELATED_CLASS}),
        dict(attrs={'class': ['bottom-rel', 'hide']}),
        dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']}),
    ]

    # The only parts of the page that make up the article itself.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'head'}),
        dict(name='h2', attrs={'class': 'subhead'}),
        dict(attrs={'class': ['sByline', 'thePhoto', 'ui-body-header']}),
        dict(attrs={'class': SBODY_CLASS}),
    ]

    remove_attributes = ['xmlns:fb']

    # (section title, feed URL) pairs.
    feeds = [
        (u'USA', u'http://rss.csmonitor.com/feeds/usa'),
        (u'World', u'http://rss.csmonitor.com/feeds/world'),
        (u'Politics', u'http://rss.csmonitor.com/feeds/politics'),
        (u'Business', u'http://rss.csmonitor.com/feeds/wam'),
        (u'Commentary', u'http://rss.csmonitor.com/feeds/commentary'),
        (u'Books', u'http://rss.csmonitor.com/feeds/books'),
        (u'Arts', u'http://rss.csmonitor.com/feeds/arts'),
        (u'Environment', u'http://rss.csmonitor.com/feeds/environment'),
        (u'Innovation', u'http://rss.csmonitor.com/feeds/scitech'),
        (u'Living', u'http://rss.csmonitor.com/feeds/living'),
        (u'Science', u'http://rss.csmonitor.com/feeds/science'),
        (u'The Culture', u'http://rss.csmonitor.com/feeds/theculture'),
        (u'The Home Forum', u'http://rss.csmonitor.com/feeds/homeforum'),
        (u'Articles', u'http://rss.csmonitor.com/feeds/csarticles'),
    ]

    def append_page(self, soup):
        """Merge the body of all follow-up pages into ``soup`` in place.

        Looks for the ``navigation`` pager and its ``next-button`` link,
        downloads the linked page, recursively merges that page's own
        follow-ups into it, and finally moves its ``sBody`` element to the
        end of the ``sBody`` element of ``soup``.  The pager of ``soup`` is
        removed once a page has been merged.

        Nothing is changed when there is no pager, no usable next link, or
        either page lacks an ``sBody`` element.

        :param soup: parsed article page, modified in place.
        """
        pager = soup.find('div', attrs={'class': 'navigation'})
        if not pager:
            return
        nexttag = pager.find(attrs={'id': 'next-button'})
        if not nexttag:
            return
        href = nexttag.get('href')
        if not href:
            # A next button without a target gives us nothing to fetch.
            return

        soup2 = self.index_to_soup('http://www.csmonitor.com' + href)
        texttag = soup2.find(attrs={'class': SBODY_CLASS})
        appendtag = soup.find(attrs={'class': SBODY_CLASS})
        if not texttag or not appendtag:
            # Without a body on both sides there is nothing to stitch.
            return

        # Two separate queries: the regex is then always applied as a
        # pattern instead of relying on how the parser treats a regex that
        # is mixed into a list of plain class names.
        for related in texttag.findAll(attrs={'class': RELATED_CLASS}):
            related.extract()
        for clutter in texttag.findAll(attrs={'class': ['bottom-rel', 'hide']}):
            clutter.extract()

        # Pull in pages 3..n first so they end up inside ``texttag``
        # before it is moved into the first page.
        self.append_page(soup2)
        texttag.extract()
        pager.extract()
        appendtag.append(texttag)

    def preprocess_html(self, soup):
        """Stitch multi-page articles together and simplify the markup.

        * follow-up pages are merged via ``append_page`` and any remaining
          ``navigation`` pager is removed;
        * every ``<a>`` is unlinked: text links become plain text, links
          wrapping an image become an attribute-less ``<div>``;
        * ``<img>`` tags whose ``src`` mentions ``scorecardresearch`` are
          dropped, all other images get an ``alt`` of ``'image'`` when they
          have none.

        :param soup: parsed article page, modified in place.
        :return: the same ``soup`` object.
        """
        self.append_page(soup)
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            pager.extract()

        for link in soup.findAll('a'):
            if link.string is not None:
                link.replaceWith(link.string)
            elif link.find('img'):
                link.name = 'div'
                # Empty container of whatever type the parser uses for
                # attributes (presumably a list in older BeautifulSoup and a
                # dict in newer ones), so serialisation keeps working.
                link.attrs = type(link.attrs)()
            else:
                link.replaceWith(self.tag_to_string(link))

        for img in soup.findAll('img'):
            # ``get`` instead of indexing: an <img> without src must not
            # abort processing of the whole article.
            if 'scorecardresearch' in (img.get('src') or ''):
                img.extract()
            elif img.get('alt') is None:
                img['alt'] = 'image'
        return soup