Commit 5034b00fcd by Kovid Goyal, 2025-06-08 20:11:32 +05:30 (GPG Key ID: 06BC317B515ACE7C)
4 changed files with 259 additions and 424 deletions

View File

@@ -1,182 +0,0 @@
#!/usr/bin/env python
from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2011, Davide Cavalca <davide125 at tiscali.it>'
'''
lwn.net
'''

import re
import sys

from calibre.utils.date import now as nowf
from calibre.web.feeds.news import BasicNewsRecipe


class WeeklyLWN(BasicNewsRecipe):
    title = 'LWN.net Weekly Edition'
    description = 'Weekly summary of what has happened in the free software world.'
    __author__ = 'Davide Cavalca'
    language = 'en'
    site_url = u'http://lwn.net'

    no_stylesheets = True
    remove_javascript = True
    extra_css = '''pre,code,samp,kbd,tt { font-size: 80% }
                   blockquote {margin-left:0 }
                   DIV.BigQuote,SPAN { font-style:oblique }
                   DIV.FeatureByline,DIV.GaByline { background-color:#EEE }
                   DIV.tlr { background-color:#EED; border-style:groove; }
                   * { color: black }'''
    cover_url = site_url + '/images/lcorner.png'
    # masthead_url = 'http://lwn.net/images/lcorner.png'
    publication_type = 'magazine'

    keep_only_tags = [dict(attrs={'class': ['PageHeadline', 'ArticleText']})]
    remove_tags = [dict(name=['h2', 'form'])]

    preprocess_regexps = [
        # Remove the <hr> and "Log in to post comments"
        (re.compile(r'<hr [^>]+>\s*\n\s*.*?comments[)]'), lambda m: ''),
    ]

    conversion_options = {
        'no_inline_navbars': True,
    }

    oldest_article = 7.0
    needs_subscription = 'optional'

    recipe_specific_options = {
        'issue': {
            'short': 'The ID of the edition to download',
            'long': 'For example, 998950\nHint: The ID can be found within the edition URL'
        }
    }

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://lwn.net/login')
            br.select_form(name='loginform')
            br['uname'] = self.username
            br['pword'] = self.password
            br.submit()
        return br

    def print_version(self, url):
        # Strip off anchor
        url = url.split('#')[0]
        # Prepend site_url
        if url[0:len(self.site_url)] != self.site_url:
            url = self.site_url + url
        # Append printable URL parameter
        print_param = '?format=printable'
        if url[-len(print_param):] != print_param:
            url += print_param
        return url

    def publication_date(self):
        return self.pub_date

    def parse_publication_date(self, soup):
        from dateutil.parser import ParserError, parse
        try:
            date_match = re.match(r'.* +for +([^\[]*)', self.tag_to_string(soup.head.title.string))
            # dateutil.parser.parse() is considered thread-safe
            self.pub_date = parse(date_match[1])
        except (TypeError, ParserError):
            self.log.warning('Failed to parse publication date from title: %r, using current time' % soup.head.title.string)
            self.pub_date = nowf()

    def parse_index(self):
        past_edition = self.recipe_specific_options.get('issue')
        if past_edition and isinstance(past_edition, str):
            index_url = self.print_version(f'/Articles/{past_edition}/bigpage')
        elif self.username is not None and self.password is not None:
            index_url = self.print_version('/current/bigpage')
        else:
            index_url = self.print_version('/free/bigpage')
        soup = self.index_to_soup(index_url)
        self.parse_publication_date(soup)

        curr = soup.body
        articles = {}
        ans = []
        section = self.tag_to_string(soup.title)
        subsection = None

        while True:
            curr = curr.findNext(
                attrs={'class': ['SummaryHL', 'Cat1HL', 'Cat2HL']})
            if curr is None:
                break

            text = self.tag_to_string(curr.contents[0])
            cclass = ''.join(curr['class'])

            if 'Cat2HL' in cclass:
                subsection = text
            elif 'Cat1HL' in cclass:
                section = text
                subsection = None
            elif 'SummaryHL' in cclass:
                article_title = text
                if not article_title:
                    article_title = _('Undefined article title')
                if subsection:
                    section_title = '%s: %s' % (section, subsection)
                else:
                    section_title = section

                # Most articles have anchors in their titles, *except* the
                # security vulnerabilities
                article_anchor = curr.find(
                    name='a', attrs={'href': re.compile(r'^/Articles/')})

                if article_anchor:
                    article_url = article_anchor.get('href')
                    if not article_url:
                        print('article_url is None for article_anchor "%s": "%s"'
                              % (str(article_anchor), article_title), file=sys.stderr)
                        continue
                else:
                    self.log.warn('article_anchor is None for "%s"; skipping' % article_title)
                    article_url = None
                    continue

                if section_title not in articles:
                    articles[section_title] = []
                if section_title not in ans:
                    ans.append(section_title)
                articles[section_title].append({
                    'url': article_url,
                    'title': article_title,
                    'description': '', 'content': '', 'date': '',
                })
            else:
                self.log.error('lwn_weekly.recipe: something bad happened; should not be able to reach this')

        ans = [(section2, articles[section2])
               for section2 in ans if section2 in articles]
        # from pprint import pprint
        # pprint(ans)

        return ans

# vim: expandtab:ts=4:sw=4
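
For reference, a minimal standalone sketch of the URL normalization that print_version() above performs; the helper name and sample URL are illustrative, not part of the recipe:

def lwn_print_version(url, site_url='http://lwn.net'):
    # Mirrors WeeklyLWN.print_version(): strip the #anchor, make the link
    # absolute, then ask LWN for its printable layout.
    url = url.split('#')[0]
    if not url.startswith(site_url):
        url = site_url + url
    if not url.endswith('?format=printable'):
        url += '?format=printable'
    return url

# lwn_print_version('/Articles/998950/bigpage#Comments')
# -> 'http://lwn.net/Articles/998950/bigpage?format=printable'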

View File

@@ -1,105 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Windsor Star
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    # # title = u'Ottawa Citizen'
    # # url_prefix = 'http://www.ottawacitizen.com'
    # # description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    # # title = u'Montreal Gazette'
    # # url_prefix = 'http://www.montrealgazette.com'
    # # description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''

    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ['section_title02', 'featurecontent']}):
            if 'section_title' in ''.join(divtag['class']):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log('Section name %s' % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))
        ans = [(keyl, articles[keyl]) for keyl in ans if keyl in articles]
        return ans

View File

@@ -1,20 +1,40 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 import json
-import time
-from datetime import datetime, timedelta
 from itertools import zip_longest
+from urllib.parse import quote, urlencode

+from calibre import browser
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes


+def get_article(article_id):
+    from mechanize import Request
+
+    mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
+    headers = {
+        'User-Agent': 'okhttp/4.10.0',
+        'Accept-Encoding': 'gzip',
+        'Cache-Control': 'no-cache',
+        'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'),  # noqa: ISC001
+    }
+    br = browser()
+    req = Request(
+        mat_url,
+        headers=headers,
+    )
+    res = br.open(req)
+    return res.read()
+
+
 class WSJ(BasicNewsRecipe):
     title = 'The Wall Street Journal'
     __author__ = 'unkn0wn'
     description = (
-        'The Print Edition of WSJ. The Wall Street Journal is your source for breaking news, analysis and insights from the U.S. and '
-        "around the world, the world's leading business and finance publication."
+        'The Print Edition of WSJ. The Wall Street Journal is your source '
+        'for breaking news, analysis and insights from the U.S. and '
+        'around the world, the world\'s leading business and finance publication.'
     )
     language = 'en_US'
     masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'

@@ -23,17 +43,18 @@ class WSJ(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
+    simultaneous_downloads = 20

     recipe_specific_options = {
         'date': {
-            'short': 'The date of the edition to download (YYYYMMDD format)\nOnly the past 6 editions will be available ',
-            'long': 'For example, 20240513'
+            'short': 'The date of the edition to download (YYYY-MM-DD format)\nOnly the past 6 editions will be available ',
+            'long': 'For example, 2024-05-13',
         },
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
             'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
-            'default': '600'
-        }
+            'default': '600',
+        },
     }

     extra_css = '''

@@ -44,20 +65,35 @@ class WSJ(BasicNewsRecipe):
     '''

     remove_tags = [
-        dict(name='panel', attrs={'id':'summary-image'}),
-        dict(name='panel', attrs={'layout':'inline'}),
-        dict(name='panel', attrs={'embed':'inner-article-ad'}),
-        dict(name='span', attrs={'embed':'ticker'}),
+        dict(name='panel', attrs={'id': 'summary-image'}),
+        dict(name='panel', attrs={'layout': 'inline'}),
+        dict(name='panel', attrs={'embed': 'inner-article-ad'}),
+        dict(name='span', attrs={'embed': 'ticker'}),
         classes('lamrelated-articles-inset-panel'),
-        dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
-            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
-        ]}),
+        dict(
+            name='p',
+            attrs={
+                'id': [
+                    'keywords',
+                    'orig-pubdate-number',
+                    'type',
+                    'is-custom-flashline',
+                    'grouphed',
+                    'author-ids',
+                    'article-manifest',
+                    'body-extract',
+                    'category',
+                    'sub-category',
+                    'socialhed',
+                    'summary',
+                    'deckline',
+                    'article-flashline',
+                ]
+            },
+        ),
     ]
-    remove_tags_before = [
-        dict(name='p', attrs={'id':'orig-pubdate-string'})
-    ]
+    remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]

     def media_bucket(self, x):
         res = '?width=600'

@@ -65,16 +101,24 @@ class WSJ(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '?width=' + w
         if x.get('type', '') == 'image':
-            if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+            if (
+                x.get('subtype', '') == 'graphic'
+                or 'images.wsj.net' not in x['manifest-url']
+            ):
                 return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                     x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
                 )
             return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
-                x['manifest-url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+                x['manifest-url'].split('?')[0] + res,
+                x['caption'] + '<i> ' + x['credit'] + '</i>',
             )
         if x.get('type', '') == 'video':
-            return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-                x['share_link'], x['thumbnail_url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+            return (
+                '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+                    x['share_link'],
+                    x['thumbnail_url'].split('?')[0] + res,
+                    x['caption'] + '<i> ' + x['credit'] + '</i>',
+                )
             )
         return

@@ -82,32 +126,32 @@ class WSJ(BasicNewsRecipe):
         jpml = soup.find('jpml')
         if jpml:
             jpml.name = 'article'
-        h1 = soup.find('p', attrs={'id':'headline'})
+        h1 = soup.find('p', attrs={'id': 'headline'})
         if h1:
             h1.name = 'h1'
         for h2 in soup.findAll('h2'):
             h2.name = 'h4'
-        dt = soup.find('p', attrs={'id':'orig-pubdate-string'})
-        read = soup.find('p', attrs={'id':'time-to-read'})
-        byl = soup.find('p', attrs={'id':'byline'})
-        fl = soup.find('p', attrs={'id':'flashline'})
+        dt = soup.find('p', attrs={'id': 'orig-pubdate-string'})
+        read = soup.find('p', attrs={'id': 'time-to-read'})
+        byl = soup.find('p', attrs={'id': 'byline'})
+        fl = soup.find('p', attrs={'id': 'flashline'})
         if dt and byl and read and fl:
             dt.name = read.name = byl.name = fl.name = 'div'
             byl.insert(0, dt)
             byl.insert(0, read)
-        url = soup.find('p', attrs={'id':'share-link'})
+        url = soup.find('p', attrs={'id': 'share-link'})
         if url:
             url.name = 'div'
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
-        panel = soup.find('panel', attrs={'id':'metadata'})
+        panel = soup.find('panel', attrs={'id': 'metadata'})
         if panel:
-            buck = panel.find('p', attrs={'id':'media-bucket'})
+            buck = panel.find('p', attrs={'id': 'media-bucket'})
             if buck:
                 data = json.loads(buck.string)
                 buck.extract()
                 i_lst = [self.media_bucket(x) for x in data['items']]
-                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+                m_itm = soup.findAll('panel', attrs={'class': 'media-item'})
                 if i_lst and m_itm:
                     for x, y in list(zip_longest(m_itm, i_lst)):
                         x.insert_after(BeautifulSoup(y, 'html.parser'))

@@ -124,13 +168,16 @@ class WSJ(BasicNewsRecipe):
         import os
         from contextlib import closing

-        from calibre import browser
         from calibre.utils.img import save_cover_data_to

         br = browser()
         raw = br.open('https://www.frontpages.com/the-wall-street-journal/')
         soup = BeautifulSoup(raw.read())
-        cu = 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
-        self.report_progress(1, _('Downloading cover from %s')%cu)
+        cu = (
+            'https://www.frontpages.com'
+            + soup.find('img', attrs={'id': 'giornale-img'})['src']
+        )
+        self.report_progress(1, _('Downloading cover from %s') % cu)
         with closing(br.open(cu, timeout=self.timeout)) as r:
             cdata = r.read()
             cpath = os.path.join(self.output_dir, 'cover.jpg')

@@ -138,69 +185,81 @@ class WSJ(BasicNewsRecipe):
             self.cover_path = cpath

     def get_browser(self, *args, **kw):
-        kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         br.addheaders += [
-            ('Accept-Encoding', 'gzip'),
-            ('cache-control', 'no-cache'),
-            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),  # noqa: ISC001
+            ('apollographql-client-name', 'wsj-mobile-android-release'),
         ]
         return br

     def parse_index(self):
-        index = 'https://bartender.mobile.dowjones.io'
-        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = [''.join([n for n in itm['key'] if n.isdigit()]) for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
-        self.log('**Past Editions available :', ', '.join(edit))
+        query = {
+            'operationName': 'IssueQuery',
+            'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
+        }
+        url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        raw = self.index_to_soup(url, raw=True)
+        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
+        edit = [x['datedLabel'] for x in cat_data][1:]
+        self.log('**Past Editions available : ' + ' | '.join(edit))

         past_edition = self.recipe_specific_options.get('date')

-        for itm in catalog['items']:
+        for itm in cat_data:
             if past_edition and isinstance(past_edition, str):
-                if past_edition in itm['key']:
-                    manifest = itm['manifest']
-                    date = itm['date']
+                if past_edition in itm['publishedDateUtc']:
+                    self.timefmt = ' [' + itm['datedLabel'] + ']'
+                    sections_ = itm['sections']
                     break
-            elif itm['type'] == 'ITP':
-                manifest = itm['manifest']
-                date = itm['date']
-                break
-
-        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt_ = dt.strftime('%b %d, %Y')
-        self.log('Downloading ', dt_)
-        self.timefmt = ' [' + dt_ + ']'
+            self.timefmt = f' [{itm["datedLabel"]}]'
+            sections_ = itm['sections']
+            break
+
+        self.log('Downloading ', self.timefmt)

         feeds = []

-        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
-        for itm in manif['items']:
-            for k, v in itm.items():
-                if '-pages_' in k:
-                    section = k.split('-pages_')[0].replace('_', ' ')
-                    if 'MAGAZINE' in section:
-                        if not (dt.day in {1, 2, 3, 4, 5, 6, 7} and dt.weekday() == 5):
-                            continue
-                        self.log('Loading Magazine section')
-                    self.log(section)
-
-                    articles = []
-
-                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
-                    data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description':desc, 'url': url})
-                    feeds.append((section, articles))
+        for sec in sections_[:-1]:
+            section = sec['label']
+            self.log(section)
+            cont_id = sec['key']
+
+            query = {
+                'operationName': 'SectionQuery',
+                'variables': '{{"id":"{}"}}'.format(cont_id),
+                'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026"}}',
+            }
+            sec_url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+                query, safe='()!', quote_via=quote
+            )
+            sec_raw = self.index_to_soup(sec_url, raw=True)
+            sec_data = json.loads(sec_raw)['data']['summaryCollectionContent'][
+                'collectionItems'
+            ]
+
+            articles = []
+
+            for art in sec_data:
+                for arts in art['collectionItems']:
+                    mobi = arts['content']['mobileSummary']
+                    title = mobi['headline']['text']
+                    desc = mobi['description']['content']['text']
+                    art_id = arts['id']
+                    self.log(' ', title, '\n\t', desc)
+                    art_cont = get_article(art_id)
+                    pt = PersistentTemporaryFile('.html')
+                    pt.write(art_cont)
+                    pt.close()
+                    url = 'file:///' + pt.name
+                    articles.append({'title': title, 'description': desc, 'url': url})
+            feeds.append((section, articles))
         return feeds

+    def preprocess_raw_html(self, raw, url):
+        return BeautifulSoup(raw).prettify()
+
     def populate_article_metadata(self, article, soup, first):
-        lnk = soup.find('div', attrs={'id':'share-link'})
+        lnk = soup.find('div', attrs={'id': 'share-link'})
         if lnk:
             article.url = lnk['title']
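
A minimal sketch of how the new issue-index URL is assembled; the endpoint, operation name and persisted-query hash come from the diff above, and the GET is assumed to return JSON whose data.mobileIssuesByMasthead list drives the rest of parse_index():

from urllib.parse import quote, urlencode

query = {
    'operationName': 'IssueQuery',
    'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
    'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
}
# quote_via=quote encodes spaces as %20 rather than '+', and safe='()!' leaves
# those characters unescaped in the query string.
url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
    query, safe='()!', quote_via=quote
)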

View File

@@ -2,11 +2,32 @@
 # vim:fileencoding=utf-8
 import json
 from itertools import zip_longest
+from urllib.parse import quote, urlencode

+from calibre import browser
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes


+def get_article(article_id):
+    from mechanize import Request
+
+    mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
+    headers = {
+        'User-Agent': 'okhttp/4.10.0',
+        'Accept-Encoding': 'gzip',
+        'Cache-Control': 'no-cache',
+        'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'),  # noqa: ISC001
+    }
+    br = browser()
+    req = Request(
+        mat_url,
+        headers=headers,
+    )
+    res = br.open(req)
+    return res.read()
+
+
 class WSJ(BasicNewsRecipe):
     title = 'WSJ. Magazine'
     __author__ = 'unkn0wn'

@@ -21,13 +42,14 @@ class WSJ(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
+    simultaneous_downloads = 20

     recipe_specific_options = {
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
             'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
-            'default': '600'
-        }
+            'default': '600',
+        },
     }

     extra_css = '''

@@ -38,20 +60,35 @@ class WSJ(BasicNewsRecipe):
     '''

     remove_tags = [
-        dict(name='panel', attrs={'id':'summary-image'}),
-        dict(name='panel', attrs={'layout':'inline'}),
-        dict(name='panel', attrs={'embed':'inner-article-ad'}),
-        dict(name='span', attrs={'embed':'ticker'}),
+        dict(name='panel', attrs={'id': 'summary-image'}),
+        dict(name='panel', attrs={'layout': 'inline'}),
+        dict(name='panel', attrs={'embed': 'inner-article-ad'}),
+        dict(name='span', attrs={'embed': 'ticker'}),
         classes('lamrelated-articles-inset-panel'),
-        dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
-            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
-        ]}),
+        dict(
+            name='p',
+            attrs={
+                'id': [
+                    'keywords',
+                    'orig-pubdate-number',
+                    'type',
+                    'is-custom-flashline',
+                    'grouphed',
+                    'author-ids',
+                    'article-manifest',
+                    'body-extract',
+                    'category',
+                    'sub-category',
+                    'socialhed',
+                    'summary',
+                    'deckline',
+                    'article-flashline',
+                ]
+            },
+        ),
     ]
-    remove_tags_before = [
-        dict(name='p', attrs={'id':'orig-pubdate-string'})
-    ]
+    remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]

     def media_bucket(self, x):
         res = '?width=600'

@@ -59,16 +96,24 @@ class WSJ(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '?width=' + w
         if x.get('type', '') == 'image':
-            if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+            if (
+                x.get('subtype', '') == 'graphic'
+                or 'images.wsj.net' not in x['manifest-url']
+            ):
                 return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                     x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
                 )
             return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
-                x['manifest-url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+                x['manifest-url'].split('?')[0] + res,
+                x['caption'] + '<i> ' + x['credit'] + '</i>',
             )
         if x.get('type', '') == 'video':
-            return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-                x['share_link'], x['thumbnail_url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+            return (
+                '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+                    x['share_link'],
+                    x['thumbnail_url'].split('?')[0] + res,
+                    x['caption'] + '<i> ' + x['credit'] + '</i>',
+                )
             )
         return

@@ -76,32 +121,32 @@ class WSJ(BasicNewsRecipe):
         jpml = soup.find('jpml')
         if jpml:
             jpml.name = 'article'
-        h1 = soup.find('p', attrs={'id':'headline'})
+        h1 = soup.find('p', attrs={'id': 'headline'})
         if h1:
             h1.name = 'h1'
         for h2 in soup.findAll('h2'):
             h2.name = 'h4'
-        dt = soup.find('p', attrs={'id':'orig-pubdate-string'})
-        read = soup.find('p', attrs={'id':'time-to-read'})
-        byl = soup.find('p', attrs={'id':'byline'})
-        fl = soup.find('p', attrs={'id':'flashline'})
+        dt = soup.find('p', attrs={'id': 'orig-pubdate-string'})
+        read = soup.find('p', attrs={'id': 'time-to-read'})
+        byl = soup.find('p', attrs={'id': 'byline'})
+        fl = soup.find('p', attrs={'id': 'flashline'})
         if dt and byl and read and fl:
             dt.name = read.name = byl.name = fl.name = 'div'
             byl.insert(0, dt)
             byl.insert(0, read)
-        url = soup.find('p', attrs={'id':'share-link'})
+        url = soup.find('p', attrs={'id': 'share-link'})
         if url:
             url.name = 'div'
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
-        panel = soup.find('panel', attrs={'id':'metadata'})
+        panel = soup.find('panel', attrs={'id': 'metadata'})
         if panel:
-            buck = panel.find('p', attrs={'id':'media-bucket'})
+            buck = panel.find('p', attrs={'id': 'media-bucket'})
             if buck:
                 data = json.loads(buck.string)
                 buck.extract()
                 i_lst = [self.media_bucket(x) for x in data['items']]
-                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+                m_itm = soup.findAll('panel', attrs={'class': 'media-item'})
                 if i_lst and m_itm:
                     for x, y in list(zip_longest(m_itm, i_lst)):
                         x.insert_after(BeautifulSoup(y, 'html.parser'))

@@ -113,51 +158,69 @@ class WSJ(BasicNewsRecipe):
         return soup

     def get_browser(self, *args, **kw):
-        kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         br.addheaders += [
-            ('Accept-Encoding', 'gzip'),
-            ('cache-control', 'no-cache'),
-            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),  # noqa: ISC001
+            ('apollographql-client-name', 'wsj-mobile-android-release'),
         ]
         return br

     def parse_index(self):
-        index = 'https://bartender.mobile.dowjones.io'
-        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                manifest = itm['manifest']
-                break
+        query = {
+            'operationName': 'IssueQuery',
+            'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
+        }
+        url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        raw = self.index_to_soup(url, raw=True)
+        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
+
+        for itm in cat_data:
+            sections_ = itm['sections']
+            break

         feeds = []

-        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
-        for itm in manif['items']:
-            for k, v in itm.items():
-                if '-pages_' in k:
-                    section = k.split('-pages_')[0].replace('_', ' ')
-                    if 'MAGAZINE' not in section:
-                        continue
-                    self.log(section)
-
-                    articles = []
-
-                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
-                    data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description':desc, 'url': url})
-                    feeds.append((section, articles))
+        sec = sections_[-1]
+        section = sec['label']
+        self.log(section)
+        cont_id = sec['key']
+
+        query = {
+            'operationName': 'SectionQuery',
+            'variables': '{{"id":"{}"}}'.format(cont_id),
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026"}}',
+        }
+        sec_url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        sec_raw = self.index_to_soup(sec_url, raw=True)
+        sec_data = json.loads(sec_raw)['data']['summaryCollectionContent'][
+            'collectionItems'
+        ]
+
+        articles = []
+
+        for art in sec_data:
+            for arts in art['collectionItems']:
+                mobi = arts['content']['mobileSummary']
+                title = mobi['headline']['text']
+                desc = mobi['description']['content']['text']
+                art_id = arts['id']
+                self.log(' ', title, '\n\t', desc)
+                art_cont = get_article(art_id)
+                pt = PersistentTemporaryFile('.html')
+                pt.write(art_cont)
+                pt.close()
+                url = 'file:///' + pt.name
+                articles.append({'title': title, 'description': desc, 'url': url})
+        feeds.append((section, articles))
        return feeds

+    def preprocess_raw_html(self, raw, url):
+        return BeautifulSoup(raw).prettify()
+
     def populate_article_metadata(self, article, soup, first):
-        lnk = soup.find('div', attrs={'id':'share-link'})
+        lnk = soup.find('div', attrs={'id': 'share-link'})
         if lnk:
             article.url = lnk['title']
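
A minimal sketch of the per-article flow both WSJ recipes now share: get_article() fetches the JPML payload for an article id from the mats.mobile.dowjones.io translate endpoint, the bytes are written to a temporary .html file, and calibre is handed a file:// URL pointing at it. The article id below is a placeholder; real ids come from the section feed (arts['id']):

from calibre.ptempfile import PersistentTemporaryFile

art_id = 'PLACEHOLDER-ARTICLE-ID'       # illustrative only
art_cont = get_article(art_id)          # raw JPML bytes from the translate endpoint
pt = PersistentTemporaryFile('.html')   # persists on disk so the download stage can read it
pt.write(art_cont)
pt.close()
entry = {'title': 'Example', 'description': '', 'url': 'file:///' + pt.name}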