This commit is contained in:
Kovid Goyal 2025-06-21 08:04:50 +05:30
commit ea7e12d0f1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -11,6 +11,7 @@ from html5_parser import parse
from lxml import etree from lxml import etree
from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -77,10 +78,7 @@ def process_web_node(node):
def load_article_from_web_json(raw): def load_article_from_web_json(raw):
# open('/t/raw.json', 'w').write(raw) # open('/t/raw.json', 'w').write(raw)
body = '' body = ''
try: data = json.loads(raw)['data']['findArticleByUrl']
data = json.loads(raw)['props']['pageProps']['cp2Content']
except Exception:
data = json.loads(raw)['props']['pageProps']['content']
body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>' body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
body += f'<h1>{data["headline"]}</h1>' body += f'<h1>{data["headline"]}</h1>'
if data.get('rubric') and data.get('rubric') is not None: if data.get('rubric') and data.get('rubric') is not None:
@ -187,14 +185,12 @@ class EconomistNews(BasicNewsRecipe):
remove_attributes = ['data-reactid', 'width', 'height'] remove_attributes = ['data-reactid', 'width', 'height']
# economist.com has started throttling after about 60% of the total has # economist.com has started throttling after about 60% of the total has
# downloaded with connection reset by peer (104) errors. # downloaded with connection reset by peer (104) errors.
delay = 3 delay = 1
remove_empty_feeds = True remove_empty_feeds = True
ignore_duplicate_articles = {'title'} ignore_duplicate_articles = {'title'}
needs_subscription = False needs_subscription = False
from_web = False
recipe_specific_options = { recipe_specific_options = {
'days': { 'days': {
'short': 'Oldest article to download from this news source. In days ', 'short': 'Oldest article to download from this news source. In days ',
@ -215,12 +211,6 @@ class EconomistNews(BasicNewsRecipe):
self.oldest_article = float(d) self.oldest_article = float(d)
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
if self.from_web:
kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else:
kwargs['user_agent'] = 'TheEconomist-Liskov-android' kwargs['user_agent'] = 'TheEconomist-Liskov-android'
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
br.addheaders += [ br.addheaders += [
@ -231,7 +221,7 @@ class EconomistNews(BasicNewsRecipe):
('x-economist-consumer', 'TheEconomist-Liskov-android'), ('x-economist-consumer', 'TheEconomist-Liskov-android'),
('x-teg-client-name', 'Economist-Android'), ('x-teg-client-name', 'Economist-Android'),
('x-teg-client-os', 'Android'), ('x-teg-client-os', 'Android'),
('x-teg-client-version', '4.30.0'), ('x-teg-client-version', '4.40.0'),
] ]
return br return br
@ -290,7 +280,6 @@ class EconomistNews(BasicNewsRecipe):
self.log('\t', title, '\n\t\t', desc) self.log('\t', title, '\n\t\t', desc)
if articles: if articles:
feeds.append((section, articles)) feeds.append((section, articles))
self.from_web = True
return feeds return feeds
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -305,15 +294,7 @@ class EconomistNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8')) # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root_ = parse(raw) html = load_article_from_web_json(raw)
if '/interactive/' in url:
return ('<html><body><article><h1>' + root_.xpath('//h1')[0].text + '</h1><em>'
'This article is supposed to be read in a browser'
'</em></article></body></html>')
script = root_.xpath('//script[@id="__NEXT_DATA__"]')
html = load_article_from_web_json(script[0].text)
root = parse(html) root = parse(html)
for div in root.xpath('//div[@class="lazy-image"]'): for div in root.xpath('//div[@class="lazy-image"]'):
@ -345,6 +326,23 @@ class EconomistNews(BasicNewsRecipe):
raw = etree.tostring(root, encoding='unicode') raw = etree.tostring(root, encoding='unicode')
return raw return raw
def get_article(self, url):
    '''
    Download the raw GraphQL response describing the article at ``url``.

    A GET request is made to the cp2-graphql-gateway endpoint using the
    ``ArticleDeeplinkQuery`` operation, with ``url`` passed as the ``ref``
    query variable.

    :param url: The article URL to look up.
    :return: Whatever ``self.index_to_soup(..., raw=True)`` returns for the
             request, i.e. the undecoded response body.
    '''
    query = {
        'operationName': 'ArticleDeeplinkQuery',
        # Serialise with json.dumps so that quotes or backslashes in the URL
        # are escaped and the variables always form valid JSON. The compact
        # separators and ensure_ascii=False keep the output identical to the
        # previous hand-built string for ordinary URLs.
        'variables': json.dumps({'ref': url}, separators=(',', ':'), ensure_ascii=False),
        'query': (
            'query ArticleDeeplinkQuery($ref: String!, $includeRelatedArticles: Boolean = true ) { findArticleByUrl(url: $ref) { __typename ...ArticleDataFragment } } fragment ContentIdentityFragment on ContentIdentity { articleType forceAppWebView leadMediaType } fragment NarrationFragment on Narration { album bitrate duration filename id provider url isAiGenerated fileHash } fragment ImageTeaserFragment on ImageComponent { altText height imageType source url width } fragment PodcastAudioFragment on PodcastEpisode { id audio { url durationInSeconds } } fragment ArticleTeaserFragment on Article { id tegId url rubric headline flyTitle brand byline dateFirstPublished dateline dateModified datePublished dateRevised estimatedReadTime wordCount printHeadline contentIdentity { __typename ...ContentIdentityFragment } section { tegId name } teaserImage { __typename type ...ImageTeaserFragment } leadComponent { __typename type ...ImageTeaserFragment } narration(selectionMethod: PREFER_ACTOR_NARRATION) { __typename ...NarrationFragment } podcast { __typename ...PodcastAudioFragment } } fragment AnnotatedTextFragment on AnnotatedText { text textJson annotations { type length index attributes { name value } } } fragment ImageComponentFragment on ImageComponent { altText caption { __typename ...AnnotatedTextFragment } credit height imageType mode source url width } fragment BlockQuoteComponentFragment on BlockQuoteComponent { text textJson annotations { type length index attributes { name value } } } fragment BookInfoComponentFragment on BookInfoComponent { text textJson annotations { type length index attributes { name value } } } fragment ParagraphComponentFragment on ParagraphComponent { text textJson annotations { type length index attributes { name value } } } fragment PullQuoteComponentFragment on PullQuoteComponent { text textJson annotations { type length index attributes { name value } } } fragment CrossheadComponentFragment on CrossheadComponent { text } fragment '  # noqa: E501
            'OrderedListComponentFragment on OrderedListComponent { items { __typename ...AnnotatedTextFragment } } fragment UnorderedListComponentFragment on UnorderedListComponent { items { __typename ...AnnotatedTextFragment } } fragment VideoComponentFragment on VideoComponent { url title thumbnailImage } fragment InfoboxComponentFragment on InfoboxComponent { components { __typename type ...BlockQuoteComponentFragment ...BookInfoComponentFragment ...ParagraphComponentFragment ...PullQuoteComponentFragment ...CrossheadComponentFragment ...OrderedListComponentFragment ...UnorderedListComponentFragment ...VideoComponentFragment } } fragment InfographicComponentFragment on InfographicComponent { url title width fallback { __typename ...ImageComponentFragment } altText height width } fragment ArticleDataFragment on Article { id url brand byline rubric headline layout { headerStyle } contentIdentity { __typename ...ContentIdentityFragment } dateline dateFirstPublished dateModified datePublished dateRevised estimatedReadTime narration(selectionMethod: PREFER_ACTOR_NARRATION) { __typename ...NarrationFragment } printFlyTitle printHeadline printRubric flyTitle wordCount section { tegId name articles(pagingInfo: { pagingType: OFFSET pageSize: 6 pageNumber: 1 } ) @include(if: $includeRelatedArticles) { edges { node { __typename ...ArticleTeaserFragment } } } } teaserImage { __typename type ...ImageComponentFragment } tegId leadComponent { __typename type ...ImageComponentFragment } body { __typename type ...BlockQuoteComponentFragment ...BookInfoComponentFragment ...ParagraphComponentFragment ...PullQuoteComponentFragment ...CrossheadComponentFragment ...OrderedListComponentFragment ...UnorderedListComponentFragment ...InfoboxComponentFragment ...ImageComponentFragment ...VideoComponentFragment ...InfographicComponentFragment } footer { __typename type ...ParagraphComponentFragment } tags { name } ads { adData } podcast { __typename ...PodcastAudioFragment } }'  # noqa: E501
        ),
    }
    # '(', ')' and '!' are left unescaped; everything else (including spaces,
    # encoded as %20 by quote rather than '+') is percent-encoded.
    url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
    raw = self.index_to_soup(url, raw=True)
    return raw
def print_version(self, url):
    '''
    Fetch the article data for ``url`` and expose it as a local file URL.

    The raw response returned by ``self.get_article(url)`` is written to a
    persistent temporary file with an ``.html`` suffix, and a ``file:///``
    URL naming that file is returned in place of the original article URL.

    :param url: The article URL to fetch.
    :return: ``'file:///'`` followed by the temporary file's path.
    '''
    article_data = self.get_article(url)
    # Use the file as a context manager so it is closed even when the write
    # fails, instead of leaking the open handle.
    with PersistentTemporaryFile('.html') as pt:
        pt.write(article_data)
    return 'file:///' + pt.name
def eco_find_image_tables(self, soup): def eco_find_image_tables(self, soup):
for x in soup.findAll('table', align=['right', 'center']): for x in soup.findAll('table', align=['right', 'center']):
if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1: if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1: