__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic '
'''
www.csmonitor.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe

# Patterns used to match the ``class`` attribute of story containers.
# NOTE: ``||`` introduces an empty alternative, which makes the surrounding
# anchors optional; the patterns therefore match the name anywhere in the
# class value. The pattern text is kept exactly as it was to preserve
# behaviour.
_STORY_BODY_CLASS = re.compile('(^|| )sBody($|| )', re.DOTALL)
_RELATED_POD_CLASS = re.compile('(^|| )podStoryRel($|| )', re.DOTALL)


class CSMonitor(BasicNewsRecipe):
    """Recipe for the daily edition of The Christian Science Monitor.

    Articles are collected from the RSS feeds listed in ``feeds``. Stories
    split over several pages are stitched together by ``append_page`` and
    links are flattened by ``preprocess_html``.
    """

    title = 'The Christian Science Monitor - daily'
    __author__ = 'Darko Miletic'
    description = 'The Christian Science Monitor is an international news organization that delivers thoughtful, global coverage via its website, weekly magazine, daily news briefing, and email newsletters.'  # noqa
    publisher = 'The Christian Science Monitor'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
    extra_css = """
                body{font-family: Arial,Tahoma,Verdana,Helvetica,sans-serif }
                img{margin-bottom: 0.4em; display:block}
                .head {font-family: Georgia,"Times New Roman",Times,serif}
                .sByline,.caption{font-size: x-small}
                .hide{display: none}
                .sLoc{font-weight: bold}
                ul{list-style-type: none}
                """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'object', 'embed']),
        dict(attrs={'class': _RELATED_POD_CLASS}),
        dict(attrs={'class': ['bottom-rel', 'hide']}),
        dict(attrs={'id': ['pgallerycarousel_enlarge',
                           'pgallerycarousel_related']})
    ]

    keep_only_tags = [
        dict(name='h1', attrs={'class': 'head'}),
        dict(name='h2', attrs={'class': 'subhead'}),
        dict(attrs={'class': ['sByline', 'thePhoto', 'ui-body-header']}),
        dict(attrs={'class': _STORY_BODY_CLASS})
    ]
    remove_attributes = ['xmlns:fb']

    feeds = [
        (u'USA', u'http://rss.csmonitor.com/feeds/usa'),
        (u'World', u'http://rss.csmonitor.com/feeds/world'),
        (u'Politics', u'http://rss.csmonitor.com/feeds/politics'),
        (u'Business', u'http://rss.csmonitor.com/feeds/wam'),
        (u'Commentary', u'http://rss.csmonitor.com/feeds/commentary'),
        (u'Books', u'http://rss.csmonitor.com/feeds/books'),
        (u'Arts', u'http://rss.csmonitor.com/feeds/arts'),
        (u'Environment', u'http://rss.csmonitor.com/feeds/environment'),
        (u'Innovation', u'http://rss.csmonitor.com/feeds/scitech'),
        (u'Living', u'http://rss.csmonitor.com/feeds/living'),
        (u'Science', u'http://rss.csmonitor.com/feeds/science'),
        (u'The Culture', u'http://rss.csmonitor.com/feeds/theculture'),
        (u'The Home Forum', u'http://rss.csmonitor.com/feeds/homeforum'),
        (u'Articles', u'http://rss.csmonitor.com/feeds/csarticles')
    ]

    def append_page(self, soup):
        """Append the story body of all following pages to ``soup`` in place.

        Looks for a ``div.navigation`` pager containing an element with id
        ``next-button``, downloads the page it links to, strips the
        related-content boxes from that page's story body and appends the
        body to the story body of ``soup``. The method recurses on the
        downloaded page first, so later pages are already merged into it
        when it is appended.

        Does nothing when there is no pager, no usable next link, or
        either page lacks a story body container.
        """
        pager = soup.find('div', attrs={'class': 'navigation'})
        if not pager:
            return
        nexttag = pager.find(attrs={'id': 'next-button'})
        if not nexttag:
            return
        # A next button without a link target cannot be followed.
        href = nexttag.get('href')
        if not href:
            return

        soup2 = self.index_to_soup('http://www.csmonitor.com' + href)
        texttag = soup2.find(attrs={'class': _STORY_BODY_CLASS})
        if not texttag:
            return
        # Without a body container on the current page there is nowhere to
        # attach the continuation; bail out before detaching anything.
        appendtag = soup.find(attrs={'class': _STORY_BODY_CLASS})
        if not appendtag:
            return

        unwanted = [_RELATED_POD_CLASS, 'bottom-rel', 'hide']
        for citem in texttag.findAll(attrs={'class': unwanted}):
            citem.extract()
        # Merge the pages after the next one into it before attaching it.
        self.append_page(soup2)
        texttag.extract()
        pager.extract()
        appendtag.append(texttag)

    def preprocess_html(self, soup):
        """Merge continuation pages and simplify the markup of ``soup``.

        After stitching multi-page stories together, any remaining pager is
        removed, every ``<a>`` is replaced by its text (or turned into an
        attribute-less ``<div>`` when it wraps an image), and images whose
        ``src`` contains ``scorecardresearch`` are dropped.

        Returns the modified ``soup``.
        """
        self.append_page(soup)
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            pager.extract()

        for item in soup.findAll('a'):
            if item.string is not None:
                # Link with a single text child: keep just the text.
                item.replaceWith(item.string)
            elif item.find('img'):
                # Keep the image but drop the link around it.
                item.name = 'div'
                # Reset with an empty container of the type the parser
                # uses (a list in BeautifulSoup 3, a dict in bs4).
                item.attrs = type(item.attrs)()
            else:
                item.replaceWith(self.tag_to_string(item))

        for item in soup.findAll('img', src=True):
            if 'scorecardresearch' in item['src']:
                item.extract()
        return soup