diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 630d055bc1..a2113f1c2f 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -5,16 +5,20 @@ try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
-import json
+import json
 
 from html5_parser import parse
 from lxml import etree
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.date import parse_only_date
 from calibre.web.feeds.news import BasicNewsRecipe
 
+# For past editions, set date to, for example, '2020-11-28'
+edition_date = None
+
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
@@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
 
     language = 'en'
     __author__ = "Kovid Goyal"
-    INDEX = 'https://www.economist.com/printedition'
     description = (
         'Global news and current affairs from a European'
         ' perspective. Best downloaded on Friday mornings (GMT)'
@@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
             article.summary = u'. '.join(result) + u'.'
             article.text_summary = clean_ascii_chars(article.summary)
 
+    def publication_date(self):
+        if edition_date:
+            return parse_only_date(edition_date, as_utc=False)
+        return BasicNewsRecipe.publication_date(self)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         #     'url':'file:///t/raw.html'
         # }])]
-        raw = self.index_to_soup(self.INDEX, raw=True)
+        if edition_date:
+            url = 'https://www.economist.com/weeklyedition/' + edition_date
+            self.timefmt = ' [' + edition_date + ']'
+        else:
+            url = 'https://www.economist.com/printedition'
+        raw = self.index_to_soup(url, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         #     f.write(raw)
         soup = self.index_to_soup(raw)
@@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
-        div = archive.find(attrs={'class': 'edition-teaser__image'})
-        if div is not None:
-            img = div.find('img', srcset=True)
+        img = None
+        if edition_date:
+            archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
+            archive = self.index_to_soup(archive_url)
+            q = edition_date.replace('-', '')
+            q = '/print-covers/{}_'.format(q)
+            img = archive.find('img', srcset=lambda x: x and q in x)
+        else:
+            archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
+            div = archive.find(attrs={'class': 'edition-teaser__image'})
+            if div is not None:
+                img = div.find('img', srcset=True)
+        if img:
             self.cover_url = img['srcset'].split(',')[-1].split()[0]
             self.log('Got cover:', self.cover_url)
-
         feeds = []
         for section in soup.findAll(**classes('layout-weekly-edition-section')):
             h2 = section.find('h2')
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index 630d055bc1..a2113f1c2f 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -5,16 +5,20 @@ try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
-import json
+import json
 
 from html5_parser import parse
 from lxml import etree
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.date import parse_only_date
 from calibre.web.feeds.news import BasicNewsRecipe
 
+# For past editions, set date to, for example, '2020-11-28'
+edition_date = None
+
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
@@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
 
     language = 'en'
    __author__ = "Kovid Goyal"
-    INDEX = 'https://www.economist.com/printedition'
     description = (
         'Global news and current affairs from a European'
         ' perspective. Best downloaded on Friday mornings (GMT)'
@@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
             article.summary = u'. '.join(result) + u'.'
             article.text_summary = clean_ascii_chars(article.summary)
 
+    def publication_date(self):
+        if edition_date:
+            return parse_only_date(edition_date, as_utc=False)
+        return BasicNewsRecipe.publication_date(self)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         #     'url':'file:///t/raw.html'
         # }])]
-        raw = self.index_to_soup(self.INDEX, raw=True)
+        if edition_date:
+            url = 'https://www.economist.com/weeklyedition/' + edition_date
+            self.timefmt = ' [' + edition_date + ']'
+        else:
+            url = 'https://www.economist.com/printedition'
+        raw = self.index_to_soup(url, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         #     f.write(raw)
         soup = self.index_to_soup(raw)
@@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
-        div = archive.find(attrs={'class': 'edition-teaser__image'})
-        if div is not None:
-            img = div.find('img', srcset=True)
+        img = None
+        if edition_date:
+            archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
+            archive = self.index_to_soup(archive_url)
+            q = edition_date.replace('-', '')
+            q = '/print-covers/{}_'.format(q)
+            img = archive.find('img', srcset=lambda x: x and q in x)
+        else:
+            archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
+            div = archive.find(attrs={'class': 'edition-teaser__image'})
+            if div is not None:
+                img = div.find('img', srcset=True)
+        if img:
             self.cover_url = img['srcset'].split(',')[-1].split()[0]
             self.log('Got cover:', self.cover_url)
-
         feeds = []
         for section in soup.findAll(**classes('layout-weekly-edition-section')):
             h2 = section.find('h2')
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 3ed920f5dc..b0361e3e80 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -1449,6 +1449,9 @@ class BasicNewsRecipe(Recipe):
     def prepare_masthead_image(self, path_to_image, out_path):
         prepare_masthead_image(path_to_image, out_path, self.MI_WIDTH, self.MI_HEIGHT)
 
+    def publication_date(self):
+        return nowf()
+
     def create_opf(self, feeds, dir=None):
         if dir is None:
             dir = self.output_dir
@@ -1477,7 +1480,7 @@ class BasicNewsRecipe(Recipe):
         language = canonicalize_lang(self.language)
         if language is not None:
             mi.language = language
-        mi.pubdate = nowf()
+        mi.pubdate = self.publication_date()
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')