Kovid Goyal 2020-12-22 14:08:50 +05:30
commit c6d9bce9e8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 62 additions and 17 deletions

recipes/economist.recipe

@@ -5,16 +5,20 @@ try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 
-import json
 from html5_parser import parse
 from lxml import etree
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.date import parse_only_date
 from calibre.web.feeds.news import BasicNewsRecipe
 
+# For past editions, set date to, for example, '2020-11-28'
+edition_date = None
+
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
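
The new module-level switch works together with the publication_date() override added further down in this file. A minimal sketch of the intended use, assuming a user edits the recipe by hand (the date is the example from the comment above):

    # In the recipe file, replace the default:
    edition_date = '2020-11-28'

    # parse_only_date() then converts the ISO string into a datetime
    # for the issue, kept in the local time zone:
    from calibre.utils.date import parse_only_date
    pubdate = parse_only_date('2020-11-28', as_utc=False)
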
@@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
     language = 'en'
 
     __author__ = "Kovid Goyal"
-    INDEX = 'https://www.economist.com/printedition'
     description = (
         'Global news and current affairs from a European'
         ' perspective. Best downloaded on Friday mornings (GMT)'
@@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
             article.summary = u'. '.join(result) + u'.'
             article.text_summary = clean_ascii_chars(article.summary)
 
+    def publication_date(self):
+        if edition_date:
+            return parse_only_date(edition_date, as_utc=False)
+        return BasicNewsRecipe.publication_date(self)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         #     'url':'file:///t/raw.html'
         # }])]
-        raw = self.index_to_soup(self.INDEX, raw=True)
+        if edition_date:
+            url = 'https://www.economist.com/weeklyedition/' + edition_date
+            self.timefmt = ' [' + edition_date + ']'
+        else:
+            url = 'https://www.economist.com/printedition'
+        raw = self.index_to_soup(url, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         #     f.write(raw)
         soup = self.index_to_soup(raw)
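
For a concrete feel of what parse_index() now requests, here is how the two branches resolve (illustrative values only; '2020-11-28' is the sample date from the header comment):

    edition_date = '2020-11-28'
    # past edition:
    #   url     -> 'https://www.economist.com/weeklyedition/2020-11-28'
    #   timefmt -> ' [2020-11-28]'  (appended to the periodical title)
    # edition_date is None (the default):
    #   url     -> 'https://www.economist.com/printedition'  (current issue)
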
@@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
-        div = archive.find(attrs={'class': 'edition-teaser__image'})
-        if div is not None:
-            img = div.find('img', srcset=True)
+        img = None
+        if edition_date:
+            archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
+            archive = self.index_to_soup(archive_url)
+            q = edition_date.replace('-', '')
+            q = '/print-covers/{}_'.format(q)
+            img = archive.find('img', srcset=lambda x: x and q in x)
+        else:
+            archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
+            div = archive.find(attrs={'class': 'edition-teaser__image'})
+            if div is not None:
+                img = div.find('img', srcset=True)
+        if img:
             self.cover_url = img['srcset'].split(',')[-1].split()[0]
             self.log('Got cover:', self.cover_url)
-
         feeds = []
         for section in soup.findAll(**classes('layout-weekly-edition-section')):
             h2 = section.find('h2')
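
Two details in the cover lookup are worth spelling out: the srcset filter is a callable, so BeautifulSoup matches any <img> whose srcset contains the '/print-covers/YYYYMMDD_' fragment for the requested edition, and the cover URL is taken from the last srcset candidate, which the recipe assumes is the highest-resolution one. A self-contained sketch with a made-up srcset value, for illustration only:

    edition_date = '2020-11-28'
    q = '/print-covers/{}_'.format(edition_date.replace('-', ''))
    # q -> '/print-covers/20201128_'

    # Hypothetical srcset attribute; each candidate is "URL width":
    srcset = ('https://e.com/print-covers/20201128_s.jpg 400w, '
              'https://e.com/print-covers/20201128_l.jpg 1280w')
    cover_url = srcset.split(',')[-1].split()[0]
    # cover_url -> 'https://e.com/print-covers/20201128_l.jpg'
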

recipes/economist_free.recipe

@@ -5,16 +5,20 @@ try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 
-import json
 from html5_parser import parse
 from lxml import etree
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.date import parse_only_date
 from calibre.web.feeds.news import BasicNewsRecipe
 
+# For past editions, set date to, for example, '2020-11-28'
+edition_date = None
+
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
@@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
     language = 'en'
 
     __author__ = "Kovid Goyal"
-    INDEX = 'https://www.economist.com/printedition'
     description = (
         'Global news and current affairs from a European'
         ' perspective. Best downloaded on Friday mornings (GMT)'
@@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
             article.summary = u'. '.join(result) + u'.'
             article.text_summary = clean_ascii_chars(article.summary)
 
+    def publication_date(self):
+        if edition_date:
+            return parse_only_date(edition_date, as_utc=False)
+        return BasicNewsRecipe.publication_date(self)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         #     'url':'file:///t/raw.html'
         # }])]
-        raw = self.index_to_soup(self.INDEX, raw=True)
+        if edition_date:
+            url = 'https://www.economist.com/weeklyedition/' + edition_date
+            self.timefmt = ' [' + edition_date + ']'
+        else:
+            url = 'https://www.economist.com/printedition'
+        raw = self.index_to_soup(url, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         #     f.write(raw)
         soup = self.index_to_soup(raw)
@@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
-        div = archive.find(attrs={'class': 'edition-teaser__image'})
-        if div is not None:
-            img = div.find('img', srcset=True)
+        img = None
+        if edition_date:
+            archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
+            archive = self.index_to_soup(archive_url)
+            q = edition_date.replace('-', '')
+            q = '/print-covers/{}_'.format(q)
+            img = archive.find('img', srcset=lambda x: x and q in x)
+        else:
+            archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
+            div = archive.find(attrs={'class': 'edition-teaser__image'})
+            if div is not None:
+                img = div.find('img', srcset=True)
+        if img:
             self.cover_url = img['srcset'].split(',')[-1].split()[0]
             self.log('Got cover:', self.cover_url)
-
         feeds = []
         for section in soup.findAll(**classes('layout-weekly-edition-section')):
             h2 = section.find('h2')

src/calibre/web/feeds/news.py

@@ -1449,6 +1449,9 @@ class BasicNewsRecipe(Recipe):
     def prepare_masthead_image(self, path_to_image, out_path):
         prepare_masthead_image(path_to_image, out_path, self.MI_WIDTH, self.MI_HEIGHT)
 
+    def publication_date(self):
+        return nowf()
+
     def create_opf(self, feeds, dir=None):
         if dir is None:
             dir = self.output_dir
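
The base implementation keeps the old behaviour (the publication date is the download time), while letting any recipe override it. A minimal sketch of such an override, mirroring what the Economist recipe above does (the class name and hard-coded date are illustrative):

    from calibre.utils.date import parse_only_date
    from calibre.web.feeds.news import BasicNewsRecipe

    class BackIssueRecipe(BasicNewsRecipe):
        def publication_date(self):
            # pin the generated book to the issue date instead of 'now'
            return parse_only_date('2020-11-28', as_utc=False)
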
@@ -1477,7 +1480,7 @@ class BasicNewsRecipe(Recipe):
         language = canonicalize_lang(self.language)
         if language is not None:
             mi.language = language
-        mi.pubdate = nowf()
+        mi.pubdate = self.publication_date()
         opf_path = os.path.join(dir, 'index.opf')
         ncx_path = os.path.join(dir, 'index.ncx')
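
The net effect: create_opf() now stamps the book's OPF metadata with whatever publication_date() returns, so the Economist recipe with edition_date set produces a correctly back-dated ebook, while every other recipe is unaffected because the default still returns nowf(). A sketch of the end-to-end flow under that assumption (the date is illustrative):

    # recipe file: edition_date = '2020-11-28'
    # during download:
    #   parse_index()       -> fetches /weeklyedition/2020-11-28
    #   publication_date()  -> parse_only_date('2020-11-28', as_utc=False)
    #   create_opf():          mi.pubdate = self.publication_date()
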