diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe
index 995e6bf5c2..09e06c1177 100644
--- a/recipes/wash_post.recipe
+++ b/recipes/wash_post.recipe
@@ -1,7 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
-__license__ = 'GPL v3'
-__copyright__ = '2011, Darko Miletic '
'''
www.washingtonpost.com
'''
@@ -15,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheWashingtonPost(BasicNewsRecipe):
title = 'The Washington Post'
- __author__ = 'Darko Miletic, unkn0wn'
+ __author__ = 'unkn0wn'
description = (
'Leading source for news, video and opinion on politics, business, '
'world and national news, science, travel, entertainment and more. '
@@ -33,6 +31,7 @@ class TheWashingtonPost(BasicNewsRecipe):
use_embedded_content = False
language = 'en_US'
remove_empty_feeds = True
+ resolve_internal_links = True
ignore_duplicate_articles = {'url'}
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg'
publication_type = 'newspaper'
@@ -56,6 +55,7 @@ class TheWashingtonPost(BasicNewsRecipe):
.img { text-align:center; font-size:small; }
.auth { font-weight:bold; font-size:small; }
.time { font-size:small; color: #202020; }
+ .subt { font-style: italic; }
'''
def get_cover_url(self):
@@ -93,15 +93,30 @@ class TheWashingtonPost(BasicNewsRecipe):
('Commanders', 'http://feeds.washingtonpost.com/rss/sports/redskins'),
]
- def preprocess_raw_html(self, raw, *a):
+ def preprocess_raw_html(self, raw, url):
root = parse(raw)
+ if '/interactive/' in url:
+ return ('' + root.xpath('//h1')[0].text + '
'
+ 'This article is supposed to be read in a browser.'
+ '')
m = root.xpath('//script[@id="__NEXT_DATA__"]')
data = json.loads(m[0].text)
data = data['props']['pageProps']['globalContent']
+ text = data.get('label', {}).get('basic', {}).get('text', '')
+ label = f'{text}
' if text else ''
title = '' + data['headlines']['basic'] + '
'
- subhead = '' + data['description'].get('basic', '') + '
'
+ subhead = '' + data['description'].get('basic', '') + ''
+
+ promo_img = ''
+ if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image':
+ pi = data['promo_items']['basic']
+ promo_img = (
+ '

{}
'.format(
+ pi['url'], pi['credits_caption_display']
+ )
+ )
author = ''
if 'credits' in data:
@@ -124,14 +139,29 @@ class TheWashingtonPost(BasicNewsRecipe):
x['promo_image']['url'], x['description'].get('basic', '')
)
elif x['type'] == 'image':
- body += (
+ img_ = (
'
{}
'.format(
x['url'], x['credits_caption_display']
)
)
+ if img_ != promo_img:
+ body += img_
+ elif x['type'] == 'list':
+ body += ''
+ for li in x['items']:
+ if li.get('content', '') != '':
+ body += f'- {li["content"]}
'
+ body += '
'
return (
- '' + title + subhead + author + body + '
'
+ ''
+ + label
+ + title
+ + subhead
+ + promo_img
+ + author
+ + body
+ + '
'
)
def preprocess_html(self, soup):
@@ -139,6 +169,6 @@ class TheWashingtonPost(BasicNewsRecipe):
img['src'] = (
'https://www.washingtonpost.com/wp-apps/imrs.php?src='
+ img['src']
- + '&w=916'
+ + '&w=600'
)
return soup
diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe
index 15e851dd38..1e7a5aa9cd 100644
--- a/recipes/wash_post_print.recipe
+++ b/recipes/wash_post_print.recipe
@@ -30,12 +30,14 @@ class wapoprint(BasicNewsRecipe):
language = 'en_US'
remove_attributes = ['style', 'height', 'width']
publication_type = 'newspaper'
+ resolve_internal_links = True
ignore_duplicate_articles = {'title', 'url'}
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg'
extra_css = '''
.img { text-align:center; font-size:small; }
.auth { font-weight:bold; font-size:small; }
.time { font-size:small; color: #202020; }
+ .subt { font-style: italic; }
'''
def get_browser(self, *args, **kwargs):
@@ -78,15 +80,30 @@ class wapoprint(BasicNewsRecipe):
feeds.append((secname, articles))
return feeds
- def preprocess_raw_html(self, raw, *a):
+ def preprocess_raw_html(self, raw, url):
root = parse(raw)
+ if '/interactive/' in url:
+ return ('' + root.xpath('//h1')[0].text + '
'
+ 'This article is supposed to be read in a browser.'
+ '')
m = root.xpath('//script[@id="__NEXT_DATA__"]')
data = json.loads(m[0].text)
data = data['props']['pageProps']['globalContent']
+ text = data.get('label', {}).get('basic', {}).get('text', '')
+ label = f'{text}
' if text else ''
title = '' + data['headlines']['basic'] + '
'
- subhead = '' + data['description'].get('basic', '') + '
'
+ subhead = '' + data['description'].get('basic', '') + ''
+
+ promo_img = ''
+ if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image':
+ pi = data['promo_items']['basic']
+ promo_img = (
+ '

{}
'.format(
+ pi['url'], pi['credits_caption_display']
+ )
+ )
author = ''
if 'credits' in data:
@@ -109,14 +126,29 @@ class wapoprint(BasicNewsRecipe):
x['promo_image']['url'], x['description'].get('basic', '')
)
elif x['type'] == 'image':
- body += (
+ img_ = (
'
{}
'.format(
x['url'], x['credits_caption_display']
)
)
+ if img_ != promo_img:
+ body += img_
+ elif x['type'] == 'list':
+ body += ''
+ for li in x['items']:
+ if li.get('content', '') != '':
+ body += f'- {li["content"]}
'
+ body += '
'
return (
- '' + title + subhead + author + body + '
'
+ ''
+ + label
+ + title
+ + subhead
+ + promo_img
+ + author
+ + body
+ + '
'
)
def preprocess_html(self, soup):
@@ -124,10 +156,9 @@ class wapoprint(BasicNewsRecipe):
img['src'] = (
'https://www.washingtonpost.com/wp-apps/imrs.php?src='
+ img['src']
- + '&w=916'
+ + '&w=600'
)
return soup
def populate_article_metadata(self, article, soup, first):
- article.summary = self.tag_to_string(soup.find('h3'))
- article.text_summary = self.tag_to_string(soup.find('h3'))
+ article.summary = article.text_summary = self.tag_to_string(soup.find('p', attrs={'class':'subt'}))