#!/usr/bin/env python # vim:fileencoding=utf-8 ''' www.washingtonpost.com ''' import json from html5_parser import parse from calibre.web.feeds.news import BasicNewsRecipe class TheWashingtonPost(BasicNewsRecipe): title = 'The Washington Post' __author__ = 'unkn0wn' description = ( 'Leading source for news, video and opinion on politics, business, ' 'world and national news, science, travel, entertainment and more. ' 'Our local coverage includes reporting on education, crime, weather, ' 'traffic, real estate, jobs and cars for DC, Maryland and Virginia. ' 'Offering award-winning opinion writing, entertainment information ' 'and restaurant reviews.' ) publisher = 'The Washington Post Company' category = 'news, politics, USA' oldest_article = 1.2 max_articles_per_feed = 200 no_stylesheets = True encoding = 'utf8' use_embedded_content = False language = 'en_US' remove_empty_feeds = True resolve_internal_links = True ignore_duplicate_articles = {'url'} masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg' publication_type = 'newspaper' remove_attributes = ['style', 'width', 'height'] recipe_specific_options = { 'days': { 'short': 'Oldest article to download from this news source. In days ', 'long': 'For example, 0.5, gives you articles from the past 12 hours', 'default': str(oldest_article), } } def __init__(self, *args, **kwargs): BasicNewsRecipe.__init__(self, *args, **kwargs) d = self.recipe_specific_options.get('days') if d and isinstance(d, str): self.oldest_article = float(d) extra_css = ''' .img { text-align:center; font-size:small; } .auth { font-weight:bold; font-size:small; } .time { font-size:small; color: #202020; } .subt { font-style: italic; } ''' def get_cover_url(self): soup = self.index_to_soup('https://www.washingtonpost.com/todays_paper/updates/') if img := soup.find( 'img', attrs={'src': lambda x: x and x.endswith('_FrontPage.png')} ): return img['src'] def get_browser(self, *args, **kwargs): kwargs['user_agent'] = ( 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' ) br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br.addheaders += [ ('Referer', 'https://www.google.com/'), ('X-Forwarded-For', '66.249.66.1'), ] return br # Official feeds: https://www.washingtonpost.com/discussions/2018/10/12/washington-post-rss-feeds/ feeds = [ ('Politics', 'http://feeds.washingtonpost.com/rss/politics'), ('Opinions', 'http://feeds.washingtonpost.com/rss/opinions'), ('Local', 'http://feeds.washingtonpost.com/rss/local'), ('Sports', 'http://feeds.washingtonpost.com/rss/sports'), ('Technology', 'http://feeds.washingtonpost.com/rss/business/technology'), ('National', 'http://feeds.washingtonpost.com/rss/national'), ('World', 'http://feeds.washingtonpost.com/rss/world'), ('Business', 'http://feeds.washingtonpost.com/rss/business'), ('Lifestyle', 'http://feeds.washingtonpost.com/rss/lifestyle'), ('Entertainment', 'http://feeds.washingtonpost.com/rss/entertainment'), # Undocumented feeds. ('White House', 'http://feeds.washingtonpost.com/rss/politics/whitehouse'), ('Commanders', 'http://feeds.washingtonpost.com/rss/sports/redskins'), ] def preprocess_raw_html(self, raw, url): root = parse(raw) if '/interactive/' in url: return ('
{text}
' if text else '' title = '' + data['description'].get('basic', '') + '' promo_img = '' if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image': pi = data['promo_items']['basic'] promo_img = ( '
' + x['content'] + '
' elif x['type'] == 'video': if 'promo_image' in x: body += '