From 58670f2fd183149ce0736b18313827ca4ecc6133 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 16:48:25 +0530
Subject: [PATCH 1/3] Update economist_espresso.recipe

---
 recipes/economist_espresso.recipe | 295 +++++-------------------------
 1 file changed, 49 insertions(+), 246 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index 18dba4922d..abbc934b32 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -3,94 +3,10 @@
 https://www.economist.com/the-world-in-brief
 '''
 
-import json
-from urllib.parse import quote, urlencode
-from uuid import uuid4
+import re
 
-from html5_parser import parse
-from lxml import etree
-
-from calibre import replace_entities
-from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-def E(parent, name, text='', **attrs):
-    ans = parent.makeelement(name, **attrs)
-    ans.text = text
-    parent.append(ans)
-    return ans
-
-
-def process_node(node, html_parent):
-    ntype = node.get('type')
-    if ntype == 'tag':
-        c = html_parent.makeelement(node['name'])
-        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
-        html_parent.append(c)
-        for nc in node.get('children', ()):
-            process_node(nc, c)
-    elif ntype == 'text':
-        text = node.get('data')
-        if text:
-            text = replace_entities(text)
-            if len(html_parent):
-                t = html_parent[-1]
-                t.tail = (t.tail or '') + text
-            else:
-                html_parent.text = (html_parent.text or '') + text
-
-
-def safe_dict(data, *names):
-    ans = data
-    for x in names:
-        ans = ans.get(x) or {}
-    return ans
-
-
-class JSONHasNoContent(ValueError):
-    pass
-
-
-def load_article_from_json(raw, root):
-    # open('/t/raw.json', 'w').write(raw)
-    data = json.loads(raw)
-    body = root.xpath('//body')[0]
-    article = E(body, 'article')
-    E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
-    E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '')
-    E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
-    E(article, 'div', data['byline'], style='font-style: italic; color:#202020;')
-    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
-    if main_image_url:
-        div = E(article, 'div')
-        try:
-            E(div, 'img', src=main_image_url)
-        except Exception:
-            pass
-    for node in data.get('text') or ():
-        process_node(node, article)
-
-
-def cleanup_html_article(root):
-    main = root.xpath('//main')[0]
-    body = root.xpath('//body')[0]
-    for child in tuple(body):
-        body.remove(child)
-    body.append(main)
-    main.set('id', '')
-    main.tag = 'article'
-    for x in root.xpath('//*[@style]'):
-        x.set('style', '')
-    for x in root.xpath('//button'):
-        x.getparent().remove(x)
-
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
 def new_tag(soup, name, attrs=()):
@@ -100,178 +16,65 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
-class NoArticles(Exception):
-    pass
-
-
-def process_url(url):
-    if url.startswith('/'):
-        url = 'https://www.economist.com' + url
-    return url
-
-
 class Espresso(BasicNewsRecipe):
     title = 'The Economist Espresso'
-    language = 'en'
+    language = 'en_GB'
     __author__ = 'unkn0wn'
-    encoding = 'utf-8'
-    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
-    cover_url = 'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
     description = (
-        'Espresso is a rich, full-flavoured shot of daily global analysis'
-        ' from the editors of The Economist to get you up to speed, fast.'
-        ' Maximise your understanding of the most significant business, '
-        'economic, political and cultural developments globally.'
+        'Espresso is a rich, full-flavoured shot of daily global analysis '
+        'from the editors of The Economist to get you up to speed, fast. '
+        'Maximise your understanding of the most significant business, '
+        'economic, political and cultural developments globally.'
     )
+    cover_url = (
+        'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
+    )
+    no_stylesheets = True
+    remove_attributes = ['height', 'width', 'style']
+    use_embedded_content = False
+    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
 
-    extra_css = '''
-        em { color:#202020; }
-        img {display:block; margin:0 auto;}
-    '''
+    extra_css = """
+        h1 { text-align:center; }
+        ._main-image, ._description, .sub { text-align:center; font-size:small; }
+        ._quote-container { font-size:x-large; font-style:italic; color:#202020; }
+    """
+
+    keep_only_tags = [dict(name='main', attrs={'id': 'content'})]
 
     remove_tags = [
-        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
-        dict(attrs={'aria-label': 'Article Teaser'}),
-        dict(attrs={
-                'class': [
-                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
-                    'related-items', 'main-content-container', 'ec-topic-widget',
-                    'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
-                    'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
-                    'newsletter-form','share-links-header','teaser--wrapped', 'latest-updates-panel__container',
-                    'latest-updates-panel__article-link','blog-post__section'
-                ]
-            }
-        ),
-        dict(attrs={
-            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes(
-            'share-links-header teaser--wrapped latest-updates-panel__container'
-            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
-        )
+        classes('_podcast-promo _newsletter-promo-container _time-last-updated'),
+        dict(attrs={'data-test-id': 'twib-audio-player'}),
     ]
-    keep_only_tags = [dict(name='article', id=lambda x: not x)]
-    no_stylesheets = True
-    remove_attributes = ['data-reactid', 'width', 'height']
-
-    def get_browser(self, *args, **kwargs):
-        kwargs['user_agent'] = 'TheEconomist-Lamarr-android'
-        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
-        br.addheaders += [
-            ('accept', '*/*'),
-            ('content-type', 'application/json'),
-            ('apollographql-client-name', 'mobile-app-apollo'),
-            ('apollographql-client-version', '3.50.0'),
-            ('x-request-id', str(uuid4())),
-        ]
-        return br
-
-    def economist_return_index(self, ans):
-        if not ans:
-            raise NoArticles(
-                'Could not find any articles, either the '
-                'economist.com server is having trouble and you should '
-                'try later or the website format has changed and the '
-                'recipe needs to be updated.'
-            )
-        return ans
 
     def parse_index(self):
-        query = {
-            'query': 'query EspressoQuery($ref:String!){espresso:canonical(ref:$ref){...EspressoFragment __typename}}fragment EspressoFragment on Content{id type hasPart(size:1 sort:"datePublished:desc"){parts{id type rubric:description hasPart(sort:"publication.context.position:asc,datePublished:desc"){parts{...ArticleFragment __typename}__typename}__typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}',  # noqa: E501
-            'operationName': 'EspressoQuery',
-            'variables': '{"ref":"/content/ai0db6q5mftflg1irq7hiiofp15t7nlv"}',
-        }
-        url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
-        try:
-            raw = self.index_to_soup(url, raw=True)
-        except Exception:
-            raise ValueError('Server is not reachable, try again after some time.')
-        ans = self.economist_parse_index(raw)
-        return self.economist_return_index(ans)
-
-    def economist_parse_index(self, raw):
-        data = json.loads(raw)['data']['espresso']['hasPart']['parts'][0]
-        self.description = data['rubric']
-
-        ans = []
-        for part in safe_dict(data, 'hasPart', 'parts'):
-            title = safe_dict(part, 'title')
-            pt = PersistentTemporaryFile('.html')
-            pt.write(json.dumps(part).encode('utf-8'))
-            pt.close()
-            url = 'file:///' + pt.name
-            ans.append({'title': title, 'url': url})
-        return [('Espresso', ans)]
+        return [
+            (
+                'Espresso',
+                [
+                    {
+                        'title': 'The World in Brief',
+                        'url': 'https://www.economist.com/the-world-in-brief',
+                        'description': 'Catch up quickly on the global stories that matter',
+                    },
+                ],
+            ),
+        ]
 
     def preprocess_html(self, soup):
+        if h1 := soup.find('h1'):
+            if p := h1.find_next_sibling('p'):
+                p['class'] = 'sub'
+        for hr in soup.findAll(attrs={'class': ['_gobbet', '_article']}):
+            nt = new_tag(soup, 'hr')
+            hr.append(nt)
         for img in soup.findAll('img', src=True):
-            img['src'] = img['src'].replace('economist.com/',
-                'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/')
+            img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
+
         return soup
 
-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
-
-    def preprocess_raw_html(self, raw, url):
-        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
-        body = '<html><body><article></article></body></html>'
-        root = parse(body)
-        load_article_from_json(raw, root)
-
-        for div in root.xpath('//div[@class="lazy-image"]'):
-            noscript = list(div.iter('noscript'))
-            if noscript and noscript[0].text:
-                img = list(parse(noscript[0].text).iter('img'))
-                if img:
-                    p = noscript[0].getparent()
-                    idx = p.index(noscript[0])
-                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
-                    p.remove(noscript[0])
-        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
-            x.getparent().remove(x)
-        # the economist uses <small> for small caps with a custom font
-        for init in root.xpath('//span[@data-caps="initial"]'):
-            init.set('style', 'font-weight:bold;')
-        for x in root.xpath('//small'):
-            if x.text and len(x) == 0:
-                x.text = x.text.upper()
-                x.tag = 'span'
-                x.set('style', 'font-variant: small-caps')
-        for h2 in root.xpath('//h2'):
-            h2.tag = 'h4'
-        for x in root.xpath('//figcaption'):
-            x.set('style', 'text-align:center; font-size:small;')
-        for x in root.xpath('//cite'):
-            x.tag = 'blockquote'
-            x.set('style', 'color:#404040;')
-        raw = etree.tostring(root, encoding='unicode')
-        return raw
-
-    def eco_find_image_tables(self, soup):
-        for x in soup.findAll('table', align=['right', 'center']):
-            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
-                yield x
-
-    def postprocess_html(self, soup, first):
-        for img in soup.findAll('img', srcset=True):
-            del img['srcset']
-        for table in list(self.eco_find_image_tables(soup)):
-            caption = table.find('font')
-            img = table.find('img')
-            div = new_tag(soup, 'div')
-            div['style'] = 'text-align:left;font-size:70%'
-            ns = NavigableString(self.tag_to_string(caption))
-            div.insert(0, ns)
-            div.insert(1, new_tag(soup, 'br'))
-            del img['width']
-            del img['height']
-            img.extract()
-            div.insert(2, img)
-            table.replaceWith(div)
-        return soup
-
-    def canonicalize_internal_url(self, url, is_link=True):
-        if url.endswith('/print'):
-            url = url.rpartition('/')[0]
-        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = (
+            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+        )
+        return BasicNewsRecipe.get_browser(self, *args, **kwargs)

From 5e9759125574b1cc9f5793173156c547d1cb8a33 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:00:46 +0530
Subject: [PATCH 2/3] ...

---
 recipes/economist_espresso.recipe | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index abbc934b32..7ff511ef26 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -44,7 +44,6 @@ class Espresso(BasicNewsRecipe):
 
     remove_tags = [
         classes('_podcast-promo _newsletter-promo-container _time-last-updated'),
-        dict(attrs={'data-test-id': 'twib-audio-player'}),
     ]
 
     def parse_index(self):
@@ -70,7 +69,10 @@ class Espresso(BasicNewsRecipe):
             hr.append(nt)
         for img in soup.findAll('img', src=True):
             img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
-
+        if aud := soup.find(attrs={'data-test-id': 'twib-audio-player'}):
+            if div := aud.find_next('div'):
+                div.extract()
+            aud.extract()
         return soup
 
     def get_browser(self, *args, **kwargs):

From 35cc158054a6efa54157b21ec2a50e82c6fca58c Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:03:57 +0530
Subject: [PATCH 3/3] ...

---
 recipes/economist_espresso.recipe | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index 7ff511ef26..bb0cae2bf0 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -36,7 +36,7 @@ class Espresso(BasicNewsRecipe):
 
     extra_css = """
         h1 { text-align:center; }
-        ._main-image, ._description, .sub { text-align:center; font-size:small; }
+        ._main-image, ._description, .sub, .calibre-nuked-tag-figcaption { text-align:center; font-size:small; }
        ._quote-container { font-size:x-large; font-style:italic; color:#202020; }
     """
 
@@ -70,7 +70,7 @@ class Espresso(BasicNewsRecipe):
         for img in soup.findAll('img', src=True):
             img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
         if aud := soup.find(attrs={'data-test-id': 'twib-audio-player'}):
-            if div := aud.find_next('div'):
+            if div := aud.find_next_sibling('div'):
                 div.extract()
             aud.extract()
         return soup