#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2011, gagsays'

'''
nationalgeographic.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class NatGeo(BasicNewsRecipe):
    title = u'National Geographic'
    __author__ = 'gagsays'
    description = 'Inspiring people to care about the planet since 1888'
    publisher = 'nationalgeographic.com'
    category = 'science, nat geo'
    language = 'en'
    encoding = 'utf8'
    oldest_article = 20
    max_articles_per_feed = 25
    no_stylesheets = True
    use_embedded_content = False
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://s.ngeo.com/wpf/sites/themes/global/i/presentation/ng_logo_small.png'

    extra_css = '''
        body {color: #000000; font-size: medium;}
        h1 {color: #222222; font-size: large; font-weight: lighter; text-decoration: none; text-align: center; font-family: Georgia,Times New Roman,Times,serif;}
        h2 {color: #454545; font-size: small; font-weight: lighter; text-decoration: none; text-align: justify; font-style: italic; font-family: Georgia,Times New Roman,Times,serif;}
        h3 {color: #555555; font-size: small; font-style: italic; margin-top: 10px;}
        img {margin-bottom: 0.25em; display: block; margin-left: auto; margin-right: auto;}
        a:link, a, .a, href {text-decoration: none; color: #000000;}
        .caption {color: #000000; font-size: xx-small; text-align: justify; font-weight: normal;}
        .credit {color: #555555; font-size: xx-small; text-align: left; font-weight: lighter;}
        p.author, p.publication {color: #000000; font-size: xx-small; text-align: left; display: inline;}
        p.publication_time {color: #000000; font-size: xx-small; text-align: right; text-decoration: underline;}
        p {margin-bottom: 0;}
        p + p {text-indent: 1.5em; margin-top: 0;}
        .hidden {display: none;}
        #page_head {text-transform: uppercase;}
    '''

    # Keep only the page header and the main article column; strip the site
    # navigation, search box and sidebar blocks from fetched pages.
    remove_tags_before = dict(id='page_head')
    keep_only_tags = [
        dict(name='div', attrs={'id': ['page_head', 'content_mainA']})
    ]
    remove_tags_after = [
        dict(name='div', attrs={'class': ['article_text', 'promo_collection']})
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['aside', 'primary full_width']}),
        dict(name='div', attrs={'id': ['header_search', 'navigation_mainB_wrap']}),
    ]

    feeds = [
        (u'Daily News', u'http://feeds.nationalgeographic.com/ng/News/News_Main')
    ]

    def parse_feeds(self):
        # Drop sponsored items and photo galleries, identified by
        # 'Presented' or 'Pictures' in the article title.
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles[:]:
                if 'Presented' in article.title or 'Pictures' in article.title:
                    feed.articles.remove(article)
        return feeds

    def preprocess_html(self, soup):
        # Unwrap hyperlinks: replace each <a> tag that contains a plain
        # string with just that string, so the output is not cluttered
        # with links.
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
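
# A minimal sketch of how to try this recipe locally (the filename
# nat_geo.recipe is an assumption, not part of the recipe): calibre's
# ebook-convert tool can run a recipe in test mode, fetching only a few
# articles per feed and writing intermediate output for debugging:
#
#   ebook-convert nat_geo.recipe .epub --test -vv --debug-pipeline debug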