From b22c7fddf88242a0f1af5c82e23a498608538603 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 9 Oct 2023 22:27:17 +0530 Subject: [PATCH] update Washington Post Print --- recipes/wash_post_print.recipe | 54 ++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe index bf033691b3..625bcb68f5 100644 --- a/recipes/wash_post_print.recipe +++ b/recipes/wash_post_print.recipe @@ -2,7 +2,9 @@ washingtonpost.com ''' -from calibre.web.feeds.news import BasicNewsRecipe, classes +from calibre.web.feeds.news import BasicNewsRecipe +from html5_parser import parse +import json class wapoprint(BasicNewsRecipe): title = 'The Washington Post | Print Edition' @@ -22,18 +24,11 @@ class wapoprint(BasicNewsRecipe): remove_attributes = ['style', 'height', 'width'] publication_type = 'newspaper' ignore_duplicate_articles = {'title', 'url'} - - keep_only_tags = [ - dict(name=['h1', 'figure']), - dict(attrs={'data-qa': 'lede-art'}), - classes('byline article-body'), - ] - - remove_tags = [ - dict(name=['meta', 'link', 'svg']), - classes('inline-video author-tooltip author-image powa-wrapper'), - dict(attrs={'data-qa': ['article-body-ad', 'subscribe-promo', 'interstitial-link-wrapper']}), - ] + extra_css = ''' + .img { text-align:center; font-size:small; } + .auth { font-weight:bold; font-size:small; } + .time { font-size:small; color: #202020; } + ''' def parse_index(self): soup = self.index_to_soup('https://www.washingtonpost.com/todays_paper/updates/') @@ -58,7 +53,36 @@ class wapoprint(BasicNewsRecipe): feeds.append((secname, articles)) return feeds + def preprocess_raw_html(self, raw, *a): + root = parse(raw) + m = root.xpath('//script[@id="__NEXT_DATA__"]') + + data = json.loads(m[0].text) + data = data['props']['pageProps']['globalContent'] + + title = '
' + x['content'] + '
' + elif x['type'] == 'video': + if 'promo_image' in x: + body += '