From 4f4af3edf1d6131248f50de70e777c89113178ae Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 11 Feb 2017 13:47:07 +0530
Subject: [PATCH] Update The Economist

The Economist is apparently doing some A/B testing with a new
React-based design for its print edition page.
---
 recipes/economist.recipe      | 147 +++++++++++++++++++++++++---------
 recipes/economist_free.recipe | 147 +++++++++++++++++++++++++---------
 2 files changed, 216 insertions(+), 78 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index a2a726b39e..f07dc512a2 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -5,12 +5,24 @@ __copyright__ = '2008, Kovid Goyal '
 '''
 economist.com
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+import cookielib
+import re
 from collections import OrderedDict
-import re
-import cookielib
+from calibre.ebooks.BeautifulSoup import NavigableString, Tag
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class NoArticles(Exception):
+    pass
+
+
+def process_url(url, print_version=True):
+    if print_version:
+        url += '/print'
+    if url.startswith('/'):
+        url = 'https://www.economist.com' + url
+    return url
 
 
 class Economist(BasicNewsRecipe):
 
@@ -20,9 +32,11 @@ class Economist(BasicNewsRecipe):
     __author__ = "Kovid Goyal"
     INDEX = 'https://www.economist.com/printedition'
 
-    description = ('Global news and current affairs from a European'
-                   ' perspective. Best downloaded on Friday mornings (GMT)')
-    extra_css = '''
+    description = (
+        'Global news and current affairs from a European'
+        ' perspective. Best downloaded on Friday mornings (GMT)'
+    )
+    extra_css = '''
     .headline {font-size: x-large;}
     h2 { font-size: small; }
     h1 { font-size: medium; }
@@ -45,17 +59,22 @@ class Economist(BasicNewsRecipe):
     oldest_article = 7.0
     resolve_internal_links = True
     remove_tags = [
-        dict(name=['script', 'noscript', 'title',
-                   'iframe', 'cf_floatingcontent']),
-        dict(attrs={'class': ['dblClkTrk', 'ec-article-info',
-                              'share_inline_header', 'related-items',
-                              'main-content-container', 'ec-topic-widget']}),
-        {'class': lambda x: x and 'share-links-header' in x},
+        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
+        dict(
+            attrs={
+                'class': [
+                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
+                    'related-items', 'main-content-container', 'ec-topic-widget'
+                ]
+            }
+        ),
+        {
+            'class': lambda x: x and 'share-links-header' in x
+        },
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
-    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
-                           lambda x:'</html>')]
+    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL), lambda x: '</html>')]
 
     # economist.com has started throttling after about 60% of the total has
     # downloaded with connection reset by peer (104) errors.
@@ -68,30 +87,81 @@ class Economist(BasicNewsRecipe):
         # Add a cookie indicating we have accepted Economist's cookie
         # policy (needed when running from some European countries)
         ck = cookielib.Cookie(
-            version=0, name='notice_preferences', value='2:', port=None,
-            port_specified=False, domain='.economist.com',
-            domain_specified=False, domain_initial_dot=True, path='/',
-            path_specified=False, secure=False, expires=None, discard=False,
-            comment=None, comment_url=None, rest={'HttpOnly': None},
-            rfc2109=False)
+            version=0,
+            name='notice_preferences',
+            value='2:',
+            port=None,
+            port_specified=False,
+            domain='.economist.com',
+            domain_specified=False,
+            domain_initial_dot=True,
+            path='/',
+            path_specified=False,
+            secure=False,
+            expires=None,
+            discard=False,
+            comment=None,
+            comment_url=None,
+            rest={'HttpOnly': None},
+            rfc2109=False
+        )
         br.cookiejar.set_cookie(ck)
+        br.set_handle_gzip(True)
         return br
 
     def parse_index(self):
-        return self.economist_parse_index()
-
-    def economist_parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos'}])]
-        soup = self.index_to_soup(self.INDEX)
-        div = soup.find('div', attrs={'class': 'issue-image'})
-        if div is not None:
-            img = div.find('img', src=True)
-            if img is not None:
-                self.cover_url = re.sub('thumbnail', 'full', img['src'])
+        raw = self.index_to_soup(self.INDEX, raw=True)
+        # with open('/t/raw.html', 'wb') as f:
+        #     f.write(raw)
+        soup = self.index_to_soup(raw)
+        ans = self.economist_parse_index(soup)
+        if not ans:
+            raise NoArticles(
+                'Could not find any articles, either the '
+                'economist.com server is having trouble and you should '
+                'try later or the website format has changed and the '
+                'recipe needs to be updated.'
+            )
+        return ans
+
+    def economist_parse_index(self, soup):
+        img = soup.find(attrs={'class': 'print-edition__cover-widget__image'})
+        if img is not None:
+            self.cover_url = process_url(img['src'], False)
+        else:
+            div = soup.find('div', attrs={'class': 'issue-image'})
+            if div is not None:
+                img = div.find('img', src=True)
+                if img is not None:
+                    self.cover_url = re.sub('thumbnail', 'full', img['src'])
+        sections = soup.findAll(
+            'div', attrs={'class': 'list__title',
+                          'data-reactid': True}
+        )
+        if sections:
+            feeds = []
+            for section in sections:
+                articles = []
+                secname = self.tag_to_string(section)
+                self.log(secname)
+                for a in section.findNextSiblings('a', href=True):
+                    title = (
+                        self.tag_to_string(
+                            a.find(attrs={'class': 'print-edition__link-title'})
+                        ) or self.tag_to_string(a)
+                    )
+                    articles.append({'title': title, 'url': process_url(a['href'])})
+                    self.log(' ', title, articles[-1]['url'])
+                if articles:
+                    feeds.append((secname, articles))
+            return feeds
+        return self.economist_parse_old_index(soup)
+
+    def economist_parse_old_index(self, soup):
         feeds = OrderedDict()
-        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in
-                                           x}):
+        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
             h4 = section.find('h4')
             if h4 is None:
                 continue
@@ -116,8 +186,12 @@ class Economist(BasicNewsRecipe):
                     if title:
                         title = prefix + title
                         self.log('\tFound article:', title)
-                        articles.append({'title': title, 'url': url,
-                                         'description': '', 'date': ''})
+                        articles.append({
+                            'title': title,
+                            'url': url,
+                            'description': '',
+                            'date': ''
+                        })
 
             if articles:
                 if section_title not in feeds:
@@ -125,11 +199,6 @@
                 feeds[section_title] += articles
 
         ans = [(key, val) for key, val in feeds.iteritems()]
-        if not ans:
-            raise Exception('Could not find any articles, either the '
-                            'economist.com server is having trouble and you should '
-                            'try later or the website format has changed and the '
-                            'recipe needs to be updated.')
         return ans
 
     def eco_find_image_tables(self, soup):
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index a2a726b39e..f07dc512a2 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -5,12 +5,24 @@ __copyright__ = '2008, Kovid Goyal '
 '''
 economist.com
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+import cookielib
+import re
 from collections import OrderedDict
-import re
-import cookielib
+from calibre.ebooks.BeautifulSoup import NavigableString, Tag
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class NoArticles(Exception):
+    pass
+
+
+def process_url(url, print_version=True):
+    if print_version:
+        url += '/print'
+    if url.startswith('/'):
+        url = 'https://www.economist.com' + url
+    return url
 
 
 class Economist(BasicNewsRecipe):
 
@@ -20,9 +32,11 @@ class Economist(BasicNewsRecipe):
     __author__ = "Kovid Goyal"
     INDEX = 'https://www.economist.com/printedition'
 
-    description = ('Global news and current affairs from a European'
-                   ' perspective. Best downloaded on Friday mornings (GMT)')
-    extra_css = '''
+    description = (
+        'Global news and current affairs from a European'
+        ' perspective. Best downloaded on Friday mornings (GMT)'
+    )
+    extra_css = '''
     .headline {font-size: x-large;}
     h2 { font-size: small; }
     h1 { font-size: medium; }
@@ -45,17 +59,22 @@ class Economist(BasicNewsRecipe):
     oldest_article = 7.0
     resolve_internal_links = True
     remove_tags = [
-        dict(name=['script', 'noscript', 'title',
-                   'iframe', 'cf_floatingcontent']),
-        dict(attrs={'class': ['dblClkTrk', 'ec-article-info',
-                              'share_inline_header', 'related-items',
-                              'main-content-container', 'ec-topic-widget']}),
-        {'class': lambda x: x and 'share-links-header' in x},
+        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
+        dict(
+            attrs={
+                'class': [
+                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
+                    'related-items', 'main-content-container', 'ec-topic-widget'
+                ]
+            }
+        ),
+        {
+            'class': lambda x: x and 'share-links-header' in x
+        },
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
-    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
-                           lambda x:'</html>')]
+    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL), lambda x: '</html>')]
 
     # economist.com has started throttling after about 60% of the total has
     # downloaded with connection reset by peer (104) errors.
@@ -68,30 +87,81 @@ class Economist(BasicNewsRecipe):
         # Add a cookie indicating we have accepted Economist's cookie
         # policy (needed when running from some European countries)
         ck = cookielib.Cookie(
-            version=0, name='notice_preferences', value='2:', port=None,
-            port_specified=False, domain='.economist.com',
-            domain_specified=False, domain_initial_dot=True, path='/',
-            path_specified=False, secure=False, expires=None, discard=False,
-            comment=None, comment_url=None, rest={'HttpOnly': None},
-            rfc2109=False)
+            version=0,
+            name='notice_preferences',
+            value='2:',
+            port=None,
+            port_specified=False,
+            domain='.economist.com',
+            domain_specified=False,
+            domain_initial_dot=True,
+            path='/',
+            path_specified=False,
+            secure=False,
+            expires=None,
+            discard=False,
+            comment=None,
+            comment_url=None,
+            rest={'HttpOnly': None},
+            rfc2109=False
+        )
         br.cookiejar.set_cookie(ck)
+        br.set_handle_gzip(True)
         return br
 
     def parse_index(self):
-        return self.economist_parse_index()
-
-    def economist_parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos'}])]
-        soup = self.index_to_soup(self.INDEX)
-        div = soup.find('div', attrs={'class': 'issue-image'})
-        if div is not None:
-            img = div.find('img', src=True)
-            if img is not None:
-                self.cover_url = re.sub('thumbnail', 'full', img['src'])
+        raw = self.index_to_soup(self.INDEX, raw=True)
+        # with open('/t/raw.html', 'wb') as f:
+        #     f.write(raw)
+        soup = self.index_to_soup(raw)
+        ans = self.economist_parse_index(soup)
+        if not ans:
+            raise NoArticles(
+                'Could not find any articles, either the '
+                'economist.com server is having trouble and you should '
+                'try later or the website format has changed and the '
+                'recipe needs to be updated.'
+            )
+        return ans
+
+    def economist_parse_index(self, soup):
+        img = soup.find(attrs={'class': 'print-edition__cover-widget__image'})
+        if img is not None:
+            self.cover_url = process_url(img['src'], False)
+        else:
+            div = soup.find('div', attrs={'class': 'issue-image'})
+            if div is not None:
+                img = div.find('img', src=True)
+                if img is not None:
+                    self.cover_url = re.sub('thumbnail', 'full', img['src'])
+        sections = soup.findAll(
+            'div', attrs={'class': 'list__title',
+                          'data-reactid': True}
+        )
+        if sections:
+            feeds = []
+            for section in sections:
+                articles = []
+                secname = self.tag_to_string(section)
+                self.log(secname)
+                for a in section.findNextSiblings('a', href=True):
+                    title = (
+                        self.tag_to_string(
+                            a.find(attrs={'class': 'print-edition__link-title'})
+                        ) or self.tag_to_string(a)
+                    )
+                    articles.append({'title': title, 'url': process_url(a['href'])})
+                    self.log(' ', title, articles[-1]['url'])
+                if articles:
+                    feeds.append((secname, articles))
+            return feeds
+        return self.economist_parse_old_index(soup)
+
+    def economist_parse_old_index(self, soup):
         feeds = OrderedDict()
-        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in
-                                           x}):
+        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
             h4 = section.find('h4')
             if h4 is None:
                 continue
@@ -116,8 +186,12 @@ class Economist(BasicNewsRecipe):
                     if title:
                         title = prefix + title
                         self.log('\tFound article:', title)
-                        articles.append({'title': title, 'url': url,
-                                         'description': '', 'date': ''})
+                        articles.append({
+                            'title': title,
+                            'url': url,
+                            'description': '',
+                            'date': ''
+                        })
 
             if articles:
                 if section_title not in feeds:
@@ -125,11 +199,6 @@
                 feeds[section_title] += articles
 
         ans = [(key, val) for key, val in feeds.iteritems()]
-        if not ans:
-            raise Exception('Could not find any articles, either the '
-                            'economist.com server is having trouble and you should '
-                            'try later or the website format has changed and the '
-                            'recipe needs to be updated.')
        return ans
 
    def eco_find_image_tables(self, soup):
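
Reviewer's note: both recipes gain the same process_url() helper, so its
behaviour can be sanity-checked in isolation. A minimal standalone sketch of
that behaviour under the patch's logic follows; the example article and image
paths are hypothetical, chosen only for illustration:

    def process_url(url, print_version=True):
        # Mirror of the helper added by this patch: the '/print' suffix is
        # appended first, then site-relative URLs are made absolute.
        if print_version:
            url += '/print'
        if url.startswith('/'):
            url = 'https://www.economist.com' + url
        return url

    # Relative article links from the React index gain both the domain and
    # the print suffix (hypothetical path):
    assert process_url('/news/leaders/21700000-example') == \
        'https://www.economist.com/news/leaders/21700000-example/print'

    # The cover image URL is made absolute without the print suffix
    # (hypothetical path):
    assert process_url('/sites/default/files/cover.jpg', False) == \
        'https://www.economist.com/sites/default/files/cover.jpg'

This is also why economist_parse_index() passes False for the cover image but
uses the default (print) form for article links.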