__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic '
''' nrc.nl '''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    """Build a new ``name`` tag for ``soup`` carrying ``attrs``.

    When ``soup`` exposes a ``new_tag`` attribute that is not ``None`` it is
    called with the attributes converted to a dict; otherwise ``Tag`` is
    instantiated directly.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # Direct construction path: an empty ``attrs`` is passed as None.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))


def _class_contains(fragment):
    """Return a matcher that accepts non-empty values containing ``fragment``."""
    return lambda value: value and fragment in value


# Recipe for nrc.nl (the class name is kept as-is for compatibility).
class Pagina12(BasicNewsRecipe):
    # --- descriptive metadata ---
    title = 'NRC'
    __author__ = 'Darko Miletic'
    description = 'News from Netherlands'
    publisher = 'nrc.nl'
    category = 'news, politics, Netherlands'

    # --- download / conversion settings ---
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'nl'
    country = 'NL'
    remove_empty_feeds = True
    masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'

    # Elements retained from each page: headline and figures, the
    # intro/byline blocks, and anything whose class mentions
    # 'article__content'.
    keep_only_tags = [
        dict(name=['h1', 'figure']),
        dict(attrs={'class': ['intro', 'byline']}),
        dict(attrs={'class': _class_contains('article__content')}),
    ]
    remove_attributes = ['style']

    feeds = ['http://www.nrc.nl/rss/']

    def preprocess_html(self, soup):
        """Copy the page's ``itemprop="image"`` URL into the featured-image div.

        The first ``<meta itemprop="image" content=...>`` supplies the URL.
        If such a meta tag exists and a ``div`` whose class contains
        'featured-img' is found, an ``<img>`` with that URL as ``src`` is
        appended to the div. The (possibly modified) soup is returned.
        """
        image_meta = soup.find('meta', itemprop='image', content=True)
        if image_meta is None:
            return soup
        image_url = image_meta['content']

        container = soup.find(
            'div', attrs={'class': _class_contains('featured-img')})
        if container is None:
            return soup

        picture = new_tag(soup, 'img')
        picture['src'] = image_url
        container.append(picture)
        return soup