From 7fe681d98feb6fb5da81588736ea1acd7fb2b134 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 19 Apr 2025 08:22:16 +0530 Subject: [PATCH] Update wash_post_print.recipe --- recipes/wash_post_print.recipe | 63 +++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/recipes/wash_post_print.recipe b/recipes/wash_post_print.recipe index 14db183c43..15e851dd38 100644 --- a/recipes/wash_post_print.recipe +++ b/recipes/wash_post_print.recipe @@ -15,10 +15,12 @@ class wapoprint(BasicNewsRecipe): title = 'The Washington Post | Print Edition' __author__ = 'unkn0wn' description = ( - 'Leading source for news, video and opinion on politics, business, world and national news, science,' - ' travel, entertainment and more. Our local coverage includes reporting on education, crime, weather,' - ' traffic, real estate, jobs and cars for DC, Maryland and Virginia. Offering award-winning opinion writing,' - ' entertainment information and restaurant reviews.' + 'Leading source for news, video and opinion on politics, business, ' + 'world and national news, science, travel, entertainment and more. ' + 'Our local coverage includes reporting on education, crime, weather, ' + 'traffic, real estate, jobs and cars for DC, Maryland and Virginia. ' + 'Offering award-winning opinion writing, entertainment information ' + 'and restaurant reviews.' ) publisher = 'The Washington Post Company' category = 'news, politics, USA' @@ -29,22 +31,33 @@ class wapoprint(BasicNewsRecipe): remove_attributes = ['style', 'height', 'width'] publication_type = 'newspaper' ignore_duplicate_articles = {'title', 'url'} - masthead_url = 'https://www.washingtonpost.com/pb/resources/img/twp-masthead-415x57.svg' + masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/93/The_Logo_of_The_Washington_Post_Newspaper.svg' extra_css = ''' .img { text-align:center; font-size:small; } .auth { font-weight:bold; font-size:small; } .time { font-size:small; color: #202020; } ''' - def get_browser(self): - return BasicNewsRecipe.get_browser( - self, verify_ssl_certificates=False, user_agent='Mozilla/5.0 (Windows NT 10.0; rv:128.0) Gecko/20100101 Firefox/128.0') + def get_browser(self, *args, **kwargs): + kwargs['user_agent'] = ( + 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' + ) + br = BasicNewsRecipe.get_browser(self, *args, **kwargs) + br.addheaders += [ + ('Referer', 'https://www.google.com/'), + ('X-Forwarded-For', '66.249.66.1'), + ] + return br def parse_index(self): soup = self.index_to_soup('https://www.washingtonpost.com/todays_paper/updates/') - if img := soup.find('img', attrs={'src':lambda x: x and x.endswith('_FrontPage.png')}): - self.cover_url = img['src'] - if h2 := soup.find('h2', attrs={'class':lambda x: x and 'font--subhead' in x.split()}): + if img := soup.find( + 'img', attrs={'src': lambda x: x and x.endswith('_FrontPage.png')} + ): + self.cover_url = img['src'] + if h2 := soup.find( + 'h2', attrs={'class': lambda x: x and 'font--subhead' in x.split()} + ): self.title = 'WaPo Print | ' + self.tag_to_string(h2) feeds = [] @@ -77,8 +90,14 @@ class wapoprint(BasicNewsRecipe): author = '' if 'credits' in data: - author = ('