#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal'

'''
economist.com
'''

import cookielib
import re
from collections import OrderedDict

from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NoArticles(Exception):
    pass


def process_url(url, print_version=True):
    if print_version:
        url += '/print'
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url


class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'https://www.economist.com/printedition'
    description = (
        'Global news and current affairs from a European'
        ' perspective. Best downloaded on Friday mornings (GMT)'
    )
    extra_css = '''
        .headline {font-size: x-large;}
        h2 { font-size: small; }
        h1 { font-size: medium; }
        em.Bold {font-weight:bold;font-style:normal;}
        em.Italic {font-style:italic;}
        p.xhead {font-weight:bold;}
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside: avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        .flytitle-and-title__flytitle {
            display: block;
            font-size: smaller;
            color: red;
        }
    '''
    oldest_article = 7.0
    resolve_internal_links = True
    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe',
                   'cf_floatingcontent', 'aside', 'footer']),
        dict(attrs={
            'class': [
                'dblClkTrk', 'ec-article-info', 'share_inline_header',
                'related-items', 'main-content-container', 'ec-topic-widget',
                'teaser', 'blog-post__bottom-panel-bottom',
                'blog-post__comments-label', 'blog-post__foot-note',
                'blog-post__sharebar', 'blog-post__bottom-panel',
            ]
        }),
        classes('share-links-header teaser--wrapped latest-updates-panel__container'
                ' latest-updates-panel__article-link blog-post__section'),
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid']

    # economist.com has started throttling after about 60% of the total has
    # downloaded, with connection reset by peer (104) errors.
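    # A one-second delay between requests works around this (and, as noted in
    # the BasicNewsRecipe documentation, any non-zero delay also makes calibre
    # reduce simultaneous_downloads to 1).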
    delay = 1
    needs_subscription = False

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = cookielib.Cookie(
            version=0, name='notice_preferences', value='2:', port=None,
            port_specified=False, domain='.economist.com',
            domain_specified=False, domain_initial_dot=True, path='/',
            path_specified=False, secure=False, expires=None, discard=False,
            comment=None, comment_url=None, rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br

    def preprocess_raw_html(self, raw, url):
        import html5lib
        from lxml import etree
        root = html5lib.parse(raw, namespaceHTMLElements=False,
                              treebuilder='lxml')
        # The site lazy-loads images; the real <img> markup is kept as text
        # inside a <noscript> tag. Parse that text and substitute a plain
        # <img> so the image survives in the downloaded article.
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(html5lib.parse(
                    noscript[0].text, namespaceHTMLElements=False,
                    treebuilder='lxml').iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or '
                            'name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        raw = etree.tostring(root, encoding=unicode)
        return raw

    def parse_index(self):
        # return [('Articles', [{'title': 'test',
        #     'url': 'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
        # }])]
        raw = self.index_to_soup(self.INDEX, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans

    def economist_parse_index(self, soup):
        img = soup.find(attrs={'class': 'print-edition__cover-widget__image'})
        if img is not None:
            self.cover_url = process_url(img['src'], False)
        else:
            div = soup.find('div', attrs={'class': 'issue-image'})
            if div is not None:
                img = div.find('img', src=True)
                if img is not None:
                    self.cover_url = re.sub('thumbnail', 'full', img['src'])
        sections = soup.findAll(
            'div', attrs={'class': 'list__title', 'data-reactid': True})
        if sections:
            feeds = []
            for section in sections:
                articles = []
                secname = self.tag_to_string(section)
                self.log(secname)
                for a in section.findNextSiblings('a', href=True):
                    spans = a.findAll('span')
                    if len(spans) == 2:
                        title = u'{}: {}'.format(*map(self.tag_to_string, spans))
                    else:
                        title = self.tag_to_string(a)
                    articles.append(
                        {'title': title, 'url': process_url(a['href'])})
                    self.log(' ', title, articles[-1]['url'])
                if articles:
                    feeds.append((secname, articles))
            return feeds
        # Fall back to the markup used by older print editions
        return self.economist_parse_old_index(soup)

    def economist_parse_old_index(self, soup):
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class': 'article'}):
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.economist.com' + url
                    url += '/print'
                    title = self.tag_to_string(a)
                    if title:
                        title = prefix + title
                        self.log('\tFound article:', title)
                        articles.append({
                            'title': title, 'url': url,
                            'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles
        ans = [(key, val) for key, val in feeds.iteritems()]
        return ans

    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        # Replace single-image tables with a plain captioned <div>, so the
        # image layout survives conversion.
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
        # Strip the /print suffix so internal links resolve to the article
        # versions actually included in the book.
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(
            self, url, is_link=is_link)
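
# A quick way to exercise this recipe during development is calibre's
# ebook-convert CLI, pointed at the recipe file directly (a sketch of the
# usual recipe-testing invocation from the calibre docs; adjust the path
# to where this file is saved):
#
#   ebook-convert economist.recipe .epub --test -vv
#
# --test downloads only a couple of articles per feed, which keeps the
# edit/run cycle short.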