From fc42e81b6d0c66c10953d80392108048053180a7 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 24 Jul 2025 10:03:25 +0530
Subject: [PATCH 1/3] Washington Post
Updated both wash_post and wash_post_print recipes to improve article formatting. Added support for displaying article labels, promo images, and list items.
---
recipes/wash_post.recipe | 45 ++++++++++++++++++++++++++++------
recipes/wash_post_print.recipe | 44 +++++++++++++++++++++++++++------
2 files changed, 74 insertions(+), 15 deletions(-)
diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe
index 995e6bf5c2..49a629b6b8 100644
--- a/recipes/wash_post.recipe
+++ b/recipes/wash_post.recipe
@@ -1,7 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
-__license__ = 'GPL v3'
-__copyright__ = '2011, Darko Miletic '
'''
www.washingtonpost.com
'''
@@ -15,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheWashingtonPost(BasicNewsRecipe):
title = 'The Washington Post'
- __author__ = 'Darko Miletic, unkn0wn'
+ __author__ = 'unkn0wn'
description = (
'Leading source for news, video and opinion on politics, business, '
'world and national news, science, travel, entertainment and more. '
@@ -33,6 +31,7 @@ class TheWashingtonPost(BasicNewsRecipe):
use_embedded_content = False
language = 'en_US'
remove_empty_feeds = True
+ resolve_internal_links = True
ignore_duplicate_articles = {'url'}
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg'
publication_type = 'newspaper'
@@ -56,6 +55,7 @@ class TheWashingtonPost(BasicNewsRecipe):
.img { text-align:center; font-size:small; }
.auth { font-weight:bold; font-size:small; }
.time { font-size:small; color: #202020; }
+ .subt { font-style: italic; }
'''
def get_cover_url(self):
@@ -100,8 +100,22 @@ class TheWashingtonPost(BasicNewsRecipe):
data = json.loads(m[0].text)
data = data['props']['pageProps']['globalContent']
- title = '' + data['headlines']['basic'] + '
'
- subhead = '' + data['description'].get('basic', '') + '
'
+ text = data.get('label', {}).get('basic', {}).get('text', '')
+ label = f'{text}
' if text else ''
+ if data.get('headlines'):
+ title = '' + data['headlines']['basic'] + '
'
+ elif data.get('metadata'):
+ title = '' + data['metadata']['headlines']['basic'] + '
'
+ subhead = '' + data['description'].get('basic', '') + ''
+
+ promo_img = ''
+ if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image':
+ pi = data['promo_items']['basic']
+ promo_img = (
+ '

{}
'.format(
+ pi['url'], pi['credits_caption_display']
+ )
+ )
author = ''
if 'credits' in data:
@@ -124,14 +138,29 @@ class TheWashingtonPost(BasicNewsRecipe):
x['promo_image']['url'], x['description'].get('basic', '')
)
elif x['type'] == 'image':
- body += (
+ img_ = (
'
{}
'.format(
x['url'], x['credits_caption_display']
)
)
+ if img_ != promo_img:
+ body += img_
+ elif x['type'] == 'list':
+ body += ''
+ for li in x['items']:
+ if li.get('content', '') != '':
+ body += f'- {li["content"]}
'
+ body += '
'
return (
- '' + title + subhead + author + body + '
'
+ ''
+ + label
+ + title
+ + subhead
+ + promo_img
+ + author
+ + body
+ + '
'
)
def preprocess_html(self, soup):
@@ -139,6 +168,6 @@ class TheWashingtonPost(BasicNewsRecipe):
img['src'] = (
'https://www.washingtonpost.com/wp-apps/imrs.php?src='
+ img['src']
- + '&w=916'
+ + '&w=600'
)
return soup
diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe
index 15e851dd38..6d15122032 100644
--- a/recipes/wash_post_print.recipe
+++ b/recipes/wash_post_print.recipe
@@ -30,12 +30,14 @@ class wapoprint(BasicNewsRecipe):
language = 'en_US'
remove_attributes = ['style', 'height', 'width']
publication_type = 'newspaper'
+ resolve_internal_links = True
ignore_duplicate_articles = {'title', 'url'}
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg'
extra_css = '''
.img { text-align:center; font-size:small; }
.auth { font-weight:bold; font-size:small; }
.time { font-size:small; color: #202020; }
+ .subt { font-style: italic; }
'''
def get_browser(self, *args, **kwargs):
@@ -85,8 +87,22 @@ class wapoprint(BasicNewsRecipe):
data = json.loads(m[0].text)
data = data['props']['pageProps']['globalContent']
- title = '' + data['headlines']['basic'] + '
'
- subhead = '' + data['description'].get('basic', '') + '
'
+ text = data.get('label', {}).get('basic', {}).get('text', '')
+ label = f'{text}
' if text else ''
+ if data.get('headlines'):
+ title = '' + data['headlines']['basic'] + '
'
+ elif data.get('metadata'):
+ title = '' + data['metadata']['headlines']['basic'] + '
'
+ subhead = '' + data['description'].get('basic', '') + ''
+
+ promo_img = ''
+ if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image':
+ pi = data['promo_items']['basic']
+ promo_img = (
+ '

{}
'.format(
+ pi['url'], pi['credits_caption_display']
+ )
+ )
author = ''
if 'credits' in data:
@@ -109,14 +125,29 @@ class wapoprint(BasicNewsRecipe):
x['promo_image']['url'], x['description'].get('basic', '')
)
elif x['type'] == 'image':
- body += (
+ img_ = (
'
{}
'.format(
x['url'], x['credits_caption_display']
)
)
+ if img_ != promo_img:
+ body += img_
+ elif x['type'] == 'list':
+ body += ''
+ for li in x['items']:
+ if li.get('content', '') != '':
+ body += f'- {li["content"]}
'
+ body += '
'
return (
- '' + title + subhead + author + body + '
'
+ ''
+ + label
+ + title
+ + subhead
+ + promo_img
+ + author
+ + body
+ + '
'
)
def preprocess_html(self, soup):
@@ -124,10 +155,9 @@ class wapoprint(BasicNewsRecipe):
img['src'] = (
'https://www.washingtonpost.com/wp-apps/imrs.php?src='
+ img['src']
- + '&w=916'
+ + '&w=600'
)
return soup
def populate_article_metadata(self, article, soup, first):
- article.summary = self.tag_to_string(soup.find('h3'))
- article.text_summary = self.tag_to_string(soup.find('h3'))
+ article.summary = article.text_summary = self.tag_to_string(soup.find('p', attrs={'class':'subt'}))
From d353c5afe6463ee061fce2a49b7b8495f665b3e0 Mon Sep 17 00:00:00 2001
From: unkn0wn <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 24 Jul 2025 11:38:54 +0530
Subject: [PATCH 2/3] ...
---
recipes/wash_post.recipe | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe
index 49a629b6b8..09e06c1177 100644
--- a/recipes/wash_post.recipe
+++ b/recipes/wash_post.recipe
@@ -93,7 +93,11 @@ class TheWashingtonPost(BasicNewsRecipe):
('Commanders', 'http://feeds.washingtonpost.com/rss/sports/redskins'),
]
- def preprocess_raw_html(self, raw, *a):
+ def preprocess_raw_html(self, raw, url):
root = parse(raw)
+ if '/interactive/' in url:  # root must be parsed first: the stub below reads its h1
+ return ('' + root.xpath('//h1')[0].text + '
'
+ 'This article is supposed to be read in a browser.'
+ '')
m = root.xpath('//script[@id="__NEXT_DATA__"]')
@@ -102,10 +106,7 @@ class TheWashingtonPost(BasicNewsRecipe):
text = data.get('label', {}).get('basic', {}).get('text', '')
label = f'{text}
' if text else ''
- if data.get('headlines'):
- title = '' + data['headlines']['basic'] + '
'
- elif data.get('metadata'):
- title = '' + data['metadata']['headlines']['basic'] + '
'
+ title = '' + data['headlines']['basic'] + '
'
subhead = '' + data['description'].get('basic', '') + ''
promo_img = ''
From b8cdf762ee56589a79470c36e669c7fde33c456b Mon Sep 17 00:00:00 2001
From: unkn0wn <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 24 Jul 2025 11:39:32 +0530
Subject: [PATCH 3/3] ...
---
recipes/wash_post_print.recipe | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe
index 6d15122032..1e7a5aa9cd 100644
--- a/recipes/wash_post_print.recipe
+++ b/recipes/wash_post_print.recipe
@@ -80,7 +80,11 @@ class wapoprint(BasicNewsRecipe):
feeds.append((secname, articles))
return feeds
- def preprocess_raw_html(self, raw, *a):
+ def preprocess_raw_html(self, raw, url):
root = parse(raw)
+ if '/interactive/' in url:  # root must be parsed first: the stub below reads its h1
+ return ('
' + root.xpath('//h1')[0].text + '
'
+ 'This article is supposed to be read in a browser.'
+ '')
m = root.xpath('//script[@id="__NEXT_DATA__"]')
@@ -89,10 +93,7 @@ class wapoprint(BasicNewsRecipe):
text = data.get('label', {}).get('basic', {}).get('text', '')
label = f'{text}
' if text else ''
- if data.get('headlines'):
- title = '' + data['headlines']['basic'] + '
'
- elif data.get('metadata'):
- title = '' + data['metadata']['headlines']['basic'] + '
'
+ title = '' + data['headlines']['basic'] + '
'
subhead = '' + data['description'].get('basic', '') + ''
promo_img = ''