#!/usr/bin/env python
'''
https://www.economist.com/the-world-in-brief
'''

import json
from urllib.parse import quote, urlencode
from uuid import uuid4

from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe


def E(parent, name, text='', **attrs):
    '''Create a ``name`` element with ``attrs``, set its text, append it to
    ``parent`` and return the new element.'''
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
    '''Recursively convert one JSON content node into children/text of
    ``html_parent``.

    Nodes whose ``type`` is ``'tag'`` become elements (with their ``attribs``
    copied, ``None`` values replaced by ``''``) and their ``children`` are
    processed recursively. Nodes whose ``type`` is ``'text'`` have their
    ``data`` entity-decoded and appended as text. Any other type is ignored.
    '''
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                # Text following an existing child lives in that child's tail.
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text


def safe_dict(data, *names):
    '''Walk ``data`` through the nested keys ``names``, substituting ``{}``
    for any missing or falsy value, and return the final value.'''
    ans = data
    for x in names:
        ans = ans.get(x) or {}
    return ans


class JSONHasNoContent(ValueError):
    pass


def load_article_from_json(raw, root):
    '''Build an ``<article>`` inside the ``<body>`` of ``root`` from the JSON
    text ``raw``.

    The article gets the fly title, the headline (an ``<h1>`` whose ``title``
    attribute carries the canonical URL, or ``''``), the rubric, the byline,
    the main image if one is present, and then the converted ``text`` nodes.
    '''
    # Debugging aid: open('/t/raw.json', 'w').write(raw)
    data = json.loads(raw)
    body = root.xpath('//body')[0]
    article = E(body, 'article')
    E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
    E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '')
    E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
    E(article, 'div', data['byline'], style='font-style: italic; color:#202020;')
    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
    if main_image_url:
        div = E(article, 'div')
        try:
            E(div, 'img', src=main_image_url)
        except Exception:
            # Best effort: an unusable image URL must not abort the article.
            pass
    for node in data.get('text') or ():
        process_node(node, article)


def cleanup_html_article(root):
    '''Reduce ``root`` to its ``<main>`` element (renamed to ``<article>``
    with an empty id), blank every ``style`` attribute and drop all buttons.'''
    main = root.xpath('//main')[0]
    body = root.xpath('//body')[0]
    # Iterate over a snapshot because children are removed while looping.
    for child in tuple(body):
        body.remove(child)
    body.append(main)
    main.set('id', '')
    main.tag = 'article'
    for x in root.xpath('//*[@style]'):
        x.set('style', '')
    for x in root.xpath('//button'):
        x.getparent().remove(x)


def classes(classes):
    '''Return a tag-matching dict whose ``class`` test is truthy when the
    element has at least one of the space separated ``classes``.'''
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    '''Create a tag named ``name`` using ``soup.new_tag`` when the soup
    provides it, otherwise by constructing ``Tag`` directly.'''
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NoArticles(Exception):
    pass


def process_url(url):
    '''Prefix site-relative URLs (those starting with ``/``) with the
    economist.com origin; return other URLs unchanged.'''
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url


class Espresso(BasicNewsRecipe):
    '''Recipe that fetches the Espresso edition as JSON from a GraphQL
    endpoint and renders every part as an article.'''

    title = 'The Economist Espresso'
    language = 'en'
    __author__ = 'unkn0wn'
    encoding = 'utf-8'
    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
    cover_url = 'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
    description = (
        'Espresso is a rich, full-flavoured shot of daily global analysis'
        ' from the editors of The Economist to get you up to speed, fast.'
        ' Maximise your understanding of the most significant business, '
        'economic, political and cultural developments globally.'
    )
    extra_css = '''
        em { color:#202020; }
        img {display:block; margin:0 auto;}
    '''

    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
        dict(attrs={'aria-label': 'Article Teaser'}),
        dict(attrs={
            'class': [
                'dblClkTrk', 'ec-article-info', 'share_inline_header',
                'related-items', 'main-content-container', 'ec-topic-widget',
                'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
                'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
                'newsletter-form', 'share-links-header', 'teaser--wrapped',
                'latest-updates-panel__container', 'latest-updates-panel__article-link',
                'blog-post__section'
            ]
        }
        ),
        dict(attrs={
            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        classes(
            'share-links-header teaser--wrapped latest-updates-panel__container'
            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
        )
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid', 'width', 'height']

    def get_browser(self, *args, **kwargs):
        '''Return a browser that uses the Android app user agent and sends
        the extra GraphQL client headers, with a fresh ``x-request-id``.'''
        kwargs['user_agent'] = 'TheEconomist-Lamarr-android'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [
            ('accept', '*/*'),
            ('content-type', 'application/json'),
            ('apollographql-client-name', 'mobile-app-apollo'),
            ('apollographql-client-version', '3.50.0'),
            ('x-request-id', str(uuid4())),
        ]
        return br

    def economist_return_index(self, ans):
        '''Return ``ans`` unchanged, raising ``NoArticles`` if it is empty.'''
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans

    def parse_index(self):
        '''Fetch the Espresso edition through the GraphQL gateway and return
        the parsed index.

        Raises ``ValueError`` (chained to the underlying error) when the
        request fails, and ``NoArticles`` when the parsed index is empty.
        '''
        query = {
            'query': 'query EspressoQuery($ref:String!){espresso:canonical(ref:$ref){...EspressoFragment __typename}}fragment EspressoFragment on Content{id type hasPart(size:1 sort:"datePublished:desc"){parts{id type rubric:description hasPart(sort:"publication.context.position:asc,datePublished:desc"){parts{...ArticleFragment __typename}__typename}__typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}',  # noqa: E501
            'operationName': 'EspressoQuery',
            'variables': '{"ref":"/content/ai0db6q5mftflg1irq7hiiofp15t7nlv"}',
        }
        url = (
            'https://cp2-graphql-gateway.p.aws.economist.com/graphql?'
            + urlencode(query, safe='()!', quote_via=quote)
        )
        try:
            raw = self.index_to_soup(url, raw=True)
        except Exception as err:
            # Chain the cause so the real failure stays visible in tracebacks.
            raise ValueError('Server is not reachable, try again after some time.') from err
        ans = self.economist_parse_index(raw)
        return self.economist_return_index(ans)

    def economist_parse_index(self, raw):
        '''Turn the GraphQL JSON response ``raw`` into a one-section index.

        Each part is dumped as JSON into its own persistent temporary
        ``.html`` file and referenced through a ``file:///`` URL, so the
        article "download" later receives that JSON. Also replaces
        ``self.description`` with the edition's rubric.
        '''
        data = json.loads(raw)['data']['espresso']['hasPart']['parts'][0]
        self.description = data['rubric']

        ans = []
        for part in safe_dict(data, 'hasPart', 'parts'):
            title = safe_dict(part, 'title')
            pt = PersistentTemporaryFile('.html')
            pt.write(json.dumps(part).encode('utf-8'))
            pt.close()
            url = 'file:///' + pt.name
            ans.append({'title': title, 'url': url})
        return [('Espresso', ans)]

    def preprocess_html(self, soup):
        '''Rewrite every image ``src`` to go through the ``cdn-cgi/image``
        path with width 600, quality 80 and automatic format.'''
        for img in soup.findAll('img', src=True):
            img['src'] = img['src'].replace('economist.com/',
                'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/')
        return soup

    def populate_article_metadata(self, article, soup, first):
        '''Set ``article.url`` from the ``title`` attribute of the page's
        first ``<h1>``, which ``load_article_from_json`` fills with the
        canonical URL.

        Pages without an ``<h1>`` or without that attribute leave
        ``article.url`` untouched instead of raising.
        '''
        h1 = soup.find('h1')
        if h1 is None:
            return
        canonical = h1.get('title')
        if canonical is not None:
            article.url = canonical

    def preprocess_raw_html(self, raw, url):
        '''Convert the downloaded JSON text ``raw`` into an HTML string.

        A skeleton document is parsed, the article is built into it from the
        JSON, and the resulting tree is then normalised (lazy images, small
        caps, headings, captions, citations) before being serialised.
        '''
        # Debugging aid: open('/t/raw.html', 'wb').write(raw.encode('utf-8'))

        # NOTE(review): this literal looks like it lost its markup somewhere;
        # only a line break remains. load_article_from_json needs a <body> in
        # the parsed tree, so this relies on parse() synthesising one --
        # TODO confirm, or restore the intended skeleton markup.
        body = '\n'
        root = parse(body)
        load_article_from_json(raw, root)

        # Replace <noscript> fallbacks inside lazy-image containers by the
        # real <img> they contain.
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(parse(noscript[0].text).iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        # the economist uses <small> for small caps with a custom font
        for init in root.xpath('//span[@data-caps="initial"]'):
            init.set('style', 'font-weight:bold;')
        for x in root.xpath('//small'):
            if x.text and len(x) == 0:
                x.text = x.text.upper()
                x.tag = 'span'
                x.set('style', 'font-variant: small-caps')
        for h2 in root.xpath('//h2'):
            h2.tag = 'h4'
        for x in root.xpath('//figcaption'):
            x.set('style', 'text-align:center; font-size:small;')
        for x in root.xpath('//cite'):
            x.tag = 'blockquote'
            x.set('style', 'color:#404040;')
        raw = etree.tostring(root, encoding='unicode')
        return raw

    def eco_find_image_tables(self, soup):
        '''Yield right/center aligned tables that contain exactly one image
        and one or two ``<font>`` tags.'''
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        '''Drop ``srcset`` from images and replace each image table by a
        ``<div>`` holding the caption text, a line break and the image.'''
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        # Materialise the generator first: the loop replaces tables in soup.
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = new_tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, new_tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
        '''Strip a trailing ``/print`` from ``url`` before delegating to the
        base class implementation.'''
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)