From 526969213cab6b1bba61881c97c56f718a181e72 Mon Sep 17 00:00:00 2001
From: shinozukayohei <76410060+shinozukayohei@users.noreply.github.com>
Date: Sun, 20 Dec 2020 15:17:03 -0800
Subject: [PATCH 1/2] Get a past edition (e.g., date='2020-11-28').

A new function edition(date) allows the user to specify a past edition.
The default, date='', takes the current edition.

The proper cover URL appears in the Calibre desktop app, though not on
Kindle Oasis - a minor remaining issue.

Much respect and admiration for Kovid Goyal's work.
---
 recipes/economist.recipe | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 630d055bc1..44ee659584 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2008, Kovid Goyal
 
+import urllib.request
+
 try:
     from http.cookiejar import Cookie
 except ImportError:
@@ -15,6 +17,14 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.web.feeds.news import BasicNewsRecipe
 
+def edition(date=''):
+    """ For past editions, set date to, for example, '2020-11-28'. """
+    title = 'The Economist'
+    INDEX = 'https://www.economist.com/weeklyedition'
+    if date:
+        title += ' ' + date
+        INDEX += '/' + date
+    return title, INDEX
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
@@ -90,11 +100,10 @@ def process_url(url):
 
 class Economist(BasicNewsRecipe):
 
-    title = 'The Economist'
+    title, INDEX = edition()
     language = 'en'
 
     __author__ = "Kovid Goyal"
-    INDEX = 'https://www.economist.com/printedition'
     description = (
         'Global news and current affairs from a European'
         ' perspective. Best downloaded on Friday mornings (GMT)'
@@ -249,13 +258,21 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
-        div = archive.find(attrs={'class': 'edition-teaser__image'})
-        if div is not None:
-            img = div.find('img', srcset=True)
-            self.cover_url = img['srcset'].split(',')[-1].split()[0]
-            self.log('Got cover:', self.cover_url)
-
+        if self.INDEX.endswith('weeklyedition'):
+            archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
+            div = archive.find(attrs={'class': 'edition-teaser__image'})
+            if div is not None:
+                img = div.find('img', srcset=True)
+                self.cover_url = img['srcset'].split(',')[-1].split()[0]
+                self.log('Got cover:', self.cover_url)
+        else:
+            date8 = self.INDEX[-10:].replace('-', '')
+            resource = urllib.request.urlopen("https://www.economist.com/weeklyedition/archive?year={}".format(date8[:4]))
+            archive = resource.read().decode(resource.headers.get_content_charset())
+            if date8 in archive:
+                parts = archive.split(date8)
+                self.cover_url = parts[-3].split(',')[-1]+date8+parts[-2].split()[0]
+                self.log('Got cover:', self.cover_url)
         feeds = []
         for section in soup.findAll(**classes('layout-weekly-edition-section')):
             h2 = section.find('h2')

From 7567ced5288ab0e9a420c9622c54a1476bc884cc Mon Sep 17 00:00:00 2001
From: shinozukayohei <76410060+shinozukayohei@users.noreply.github.com>
Date: Mon, 21 Dec 2020 12:18:27 -0800
Subject: [PATCH 2/2] Replace urllib with self.index_to_soup()

urllib is no longer used to make requests. Instead self.index_to_soup()
is called, with raw=True.
---
 recipes/economist.recipe | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 44ee659584..f91c88d2cf 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2008, Kovid Goyal
 
-import urllib.request
-
 try:
     from http.cookiejar import Cookie
 except ImportError:
@@ -267,8 +265,8 @@ class Economist(BasicNewsRecipe):
                 self.log('Got cover:', self.cover_url)
         else:
             date8 = self.INDEX[-10:].replace('-', '')
-            resource = urllib.request.urlopen("https://www.economist.com/weeklyedition/archive?year={}".format(date8[:4]))
-            archive = resource.read().decode(resource.headers.get_content_charset())
+            archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(date8[:4])
+            archive = self.index_to_soup(archive_url, raw=True).decode("utf-8")
         if date8 in archive:
             parts = archive.split(date8)
             self.cover_url = parts[-3].split(',')[-1]+date8+parts[-2].split()[0]