diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe index 995e6bf5c2..49a629b6b8 100644 --- a/recipes/wash_post.recipe +++ b/recipes/wash_post.recipe @@ -1,7 +1,5 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -__license__ = 'GPL v3' -__copyright__ = '2011, Darko Miletic ' ''' www.washingtonpost.com ''' @@ -15,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheWashingtonPost(BasicNewsRecipe): title = 'The Washington Post' - __author__ = 'Darko Miletic, unkn0wn' + __author__ = 'unkn0wn' description = ( 'Leading source for news, video and opinion on politics, business, ' 'world and national news, science, travel, entertainment and more. ' @@ -33,6 +31,7 @@ class TheWashingtonPost(BasicNewsRecipe): use_embedded_content = False language = 'en_US' remove_empty_feeds = True + resolve_internal_links = True ignore_duplicate_articles = {'url'} masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg' publication_type = 'newspaper' @@ -56,6 +55,7 @@ class TheWashingtonPost(BasicNewsRecipe): .img { text-align:center; font-size:small; } .auth { font-weight:bold; font-size:small; } .time { font-size:small; color: #202020; } + .subt { font-style: italic; } ''' def get_cover_url(self): @@ -100,8 +100,22 @@ class TheWashingtonPost(BasicNewsRecipe): data = json.loads(m[0].text) data = data['props']['pageProps']['globalContent'] - title = '

' + data['headlines']['basic'] + '

' - subhead = '

' + data['description'].get('basic', '') + '

' + text = data.get('label', {}).get('basic', {}).get('text', '') + label = f'

{text}

' if text else '' + if data.get('headlines'): + title = '

' + data['headlines']['basic'] + '

' + elif data.get('metadata'): + title = '

' + data['metadata']['headlines']['basic'] + '

' + subhead = '

' + data['description'].get('basic', '') + '' + + promo_img = '' + if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image': + pi = data['promo_items']['basic'] + promo_img = ( + '

{}

'.format( + pi['url'], pi['credits_caption_display'] + ) + ) author = '' if 'credits' in data: @@ -124,14 +138,29 @@ class TheWashingtonPost(BasicNewsRecipe): x['promo_image']['url'], x['description'].get('basic', '') ) elif x['type'] == 'image': - body += ( + img_ = ( '

{}

'.format( x['url'], x['credits_caption_display'] ) ) + if img_ != promo_img: + body += img_ + elif x['type'] == 'list': + body += '' return ( - '
' + title + subhead + author + body + '
' + '
' + + label + + title + + subhead + + promo_img + + author + + body + + '
' ) def preprocess_html(self, soup): @@ -139,6 +168,6 @@ class TheWashingtonPost(BasicNewsRecipe): img['src'] = ( 'https://www.washingtonpost.com/wp-apps/imrs.php?src=' + img['src'] - + '&w=916' + + '&w=600' ) return soup diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe index 15e851dd38..6d15122032 100644 --- a/recipes/wash_post_print.recipe +++ b/recipes/wash_post_print.recipe @@ -30,12 +30,14 @@ class wapoprint(BasicNewsRecipe): language = 'en_US' remove_attributes = ['style', 'height', 'width'] publication_type = 'newspaper' + resolve_internal_links = True ignore_duplicate_articles = {'title', 'url'} masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg' extra_css = ''' .img { text-align:center; font-size:small; } .auth { font-weight:bold; font-size:small; } .time { font-size:small; color: #202020; } + .subt { font-style: italic; } ''' def get_browser(self, *args, **kwargs): @@ -85,8 +87,22 @@ class wapoprint(BasicNewsRecipe): data = json.loads(m[0].text) data = data['props']['pageProps']['globalContent'] - title = '

' + data['headlines']['basic'] + '

' - subhead = '

' + data['description'].get('basic', '') + '

' + text = data.get('label', {}).get('basic', {}).get('text', '') + label = f'

{text}

' if text else '' + if data.get('headlines'): + title = '

' + data['headlines']['basic'] + '

' + elif data.get('metadata'): + title = '

' + data['metadata']['headlines']['basic'] + '

' + subhead = '

' + data['description'].get('basic', '') + '' + + promo_img = '' + if data.get('promo_items', {}).get('basic', {}).get('type', '') == 'image': + pi = data['promo_items']['basic'] + promo_img = ( + '

{}

'.format( + pi['url'], pi['credits_caption_display'] + ) + ) author = '' if 'credits' in data: @@ -109,14 +125,29 @@ class wapoprint(BasicNewsRecipe): x['promo_image']['url'], x['description'].get('basic', '') ) elif x['type'] == 'image': - body += ( + img_ = ( '

{}

'.format( x['url'], x['credits_caption_display'] ) ) + if img_ != promo_img: + body += img_ + elif x['type'] == 'list': + body += '' return ( - '
' + title + subhead + author + body + '
' + '
' + + label + + title + + subhead + + promo_img + + author + + body + + '
' ) def preprocess_html(self, soup): @@ -124,10 +155,9 @@ class wapoprint(BasicNewsRecipe): img['src'] = ( 'https://www.washingtonpost.com/wp-apps/imrs.php?src=' + img['src'] - + '&w=916' + + '&w=600' ) return soup def populate_article_metadata(self, article, soup, first): - article.summary = self.tag_to_string(soup.find('h3')) - article.text_summary = self.tag_to_string(soup.find('h3')) + article.summary = article.text_summary = self.tag_to_string(soup.find('p', attrs={'class':'subt'}))