This commit is contained in:
Kovid Goyal 2020-12-22 14:08:50 +05:30
commit c6d9bce9e8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 62 additions and 17 deletions

View File

@ -5,16 +5,20 @@ try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
import json
from html5_parser import parse
from lxml import etree
from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.date import parse_only_date
from calibre.web.feeds.news import BasicNewsRecipe
# For past editions, set date to, for example, '2020-11-28'
edition_date = None
def E(parent, name, text='', **attrs):
ans = parent.makeelement(name, **attrs)
@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
language = 'en'
__author__ = "Kovid Goyal"
INDEX = 'https://www.economist.com/printedition'
description = (
'Global news and current affairs from a European'
' perspective. Best downloaded on Friday mornings (GMT)'
@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
article.summary = u'. '.join(result) + u'.'
article.text_summary = clean_ascii_chars(article.summary)
def publication_date(self):
    """Return the date to stamp on the generated issue.

    When the module-level ``edition_date`` override is set (a past
    edition such as '2020-11-28'), parse and return that date;
    otherwise defer to the base recipe's default behaviour.
    """
    if not edition_date:
        return BasicNewsRecipe.publication_date(self)
    return parse_only_date(edition_date, as_utc=False)
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']'
else:
url = 'https://www.economist.com/printedition'
raw = self.index_to_soup(url, raw=True)
# with open('/t/raw.html', 'wb') as f:
# f.write(raw)
soup = self.index_to_soup(raw)
@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
return ans
def economist_parse_index(self, soup):
archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
div = archive.find(attrs={'class': 'edition-teaser__image'})
if div is not None:
img = div.find('img', srcset=True)
img = None
if edition_date:
archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
archive = self.index_to_soup(archive_url)
q = edition_date.replace('-', '')
q = '/print-covers/{}_'.format(q)
img = archive.find('img', srcset=lambda x: x and q in x)
else:
archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
div = archive.find(attrs={'class': 'edition-teaser__image'})
if div is not None:
img = div.find('img', srcset=True)
if img:
self.cover_url = img['srcset'].split(',')[-1].split()[0]
self.log('Got cover:', self.cover_url)
feeds = []
for section in soup.findAll(**classes('layout-weekly-edition-section')):
h2 = section.find('h2')

View File

@ -5,16 +5,20 @@ try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
import json
from html5_parser import parse
from lxml import etree
from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.date import parse_only_date
from calibre.web.feeds.news import BasicNewsRecipe
# For past editions, set date to, for example, '2020-11-28'
edition_date = None
def E(parent, name, text='', **attrs):
ans = parent.makeelement(name, **attrs)
@ -94,7 +98,6 @@ class Economist(BasicNewsRecipe):
language = 'en'
__author__ = "Kovid Goyal"
INDEX = 'https://www.economist.com/printedition'
description = (
'Global news and current affairs from a European'
' perspective. Best downloaded on Friday mornings (GMT)'
@ -224,11 +227,21 @@ class Economist(BasicNewsRecipe):
article.summary = u'. '.join(result) + u'.'
article.text_summary = clean_ascii_chars(article.summary)
def publication_date(self):
    # If a past edition was requested via the module-level edition_date
    # override, report that edition's date; otherwise fall back to the
    # default from BasicNewsRecipe (the download time).
    if edition_date:
        return parse_only_date(edition_date, as_utc=False)
    return BasicNewsRecipe.publication_date(self)
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
if edition_date:
url = 'https://www.economist.com/weeklyedition/' + edition_date
self.timefmt = ' [' + edition_date + ']'
else:
url = 'https://www.economist.com/printedition'
raw = self.index_to_soup(url, raw=True)
# with open('/t/raw.html', 'wb') as f:
# f.write(raw)
soup = self.index_to_soup(raw)
@ -249,13 +262,21 @@ class Economist(BasicNewsRecipe):
return ans
def economist_parse_index(self, soup):
archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
div = archive.find(attrs={'class': 'edition-teaser__image'})
if div is not None:
img = div.find('img', srcset=True)
img = None
if edition_date:
archive_url = "https://www.economist.com/weeklyedition/archive?year={}".format(edition_date[:4])
archive = self.index_to_soup(archive_url)
q = edition_date.replace('-', '')
q = '/print-covers/{}_'.format(q)
img = archive.find('img', srcset=lambda x: x and q in x)
else:
archive = self.index_to_soup("https://www.economist.com/weeklyedition/archive")
div = archive.find(attrs={'class': 'edition-teaser__image'})
if div is not None:
img = div.find('img', srcset=True)
if img:
self.cover_url = img['srcset'].split(',')[-1].split()[0]
self.log('Got cover:', self.cover_url)
feeds = []
for section in soup.findAll(**classes('layout-weekly-edition-section')):
h2 = section.find('h2')

View File

@ -1449,6 +1449,9 @@ class BasicNewsRecipe(Recipe):
def prepare_masthead_image(self, path_to_image, out_path):
    # Thin wrapper: delegates to the module-level prepare_masthead_image
    # helper, supplying this recipe's masthead dimensions.
    prepare_masthead_image(path_to_image, out_path, self.MI_WIDTH, self.MI_HEIGHT)
def publication_date(self):
    # Default publication date: now. Subclasses that can fetch past
    # editions (e.g. the Economist recipe in this commit) override this
    # to return the requested edition's date instead.
    return nowf()
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
@ -1477,7 +1480,7 @@ class BasicNewsRecipe(Recipe):
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
mi.pubdate = nowf()
mi.pubdate = self.publication_date()
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')