This commit is contained in:
Kovid Goyal 2023-10-20 09:38:31 +05:30
commit bf4732a4ff
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 41 additions and 114 deletions

View File

@ -10,8 +10,7 @@ from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try: try:
@ -24,7 +23,7 @@ except ImportError:
from urllib import quote from urllib import quote
needs_subscription = 'optional' needs_subscription = True
def substring_classes(classes): def substring_classes(classes):
@ -60,68 +59,33 @@ class WSJ(BasicNewsRecipe):
needs_subscription = needs_subscription needs_subscription = needs_subscription
WSJ_ITP = 'https://www.wsj.com/print-edition/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
storage = []
extra_css = ''' extra_css = '''
#big-top-caption { font-size:small; text-align:center; } .imageCaption{font-size:small; text-align:center;}
[data-type:"tagline"], em { font-style:italic; color:#202020; } .sub-head{font-style:italic; color:#404040;}
.auth { font-size:small; } .bylineWrap{font-size:small; text-align:left;}
''' '''
keep_only_tags = [ keep_only_tags = [
dict(name=['h1', 'h2']), dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
dict(attrs={'aria-describedby':'big-top-caption'}), dict(name='main'),
dict(attrs={'id':'big-top-caption'}),
dict(name='article')
] ]
remove_tags = [ remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget']), classes(
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}), 'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
dict(attrs={'data-type':'inset'}), ' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
dict(attrs={'id':lambda x: x and x.startswith(('wrapper-INLINE', 'audio-tag-inner-audio-'))}) dict(role=["toolbar", "complementary"]),
dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
dict(name='amp-iframe'), # interactive graphics
] ]
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser()
br.set_handle_redirect(False)
try:
br.open(url)
except Exception as e:
url = e.hdrs.get('location')
raw = read_url(self.storage, 'https://archive.is/latest/' + url)
pt = PersistentTemporaryFile('.html')
pt.write(raw.encode('utf-8'))
pt.close()
return pt.name
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}): for by in soup.findAll(**classes('bylineWrap')):
img['src'] = img['old-src'] for p in by.findAll('p'):
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): p.name = 'span'
p.name = 'p' h2 = soup.find('h2', attrs={'class':'sub-head'})
for a in soup.findAll('a', href=True): if h2:
a['href'] = 'http' + a['href'].split('http')[-1] h2.name = 'p'
for fig in soup.findAll('figure'):
if fig.find('video'):
fig.extract()
for figc in soup.findAll('figcaption'):
figc['id'] = 'big-top-caption'
if name:= soup.find('h2', attrs={'itemprop':'name'}):
name.extract()
for h2 in soup.findAll('h2'):
if self.tag_to_string(h2).startswith('What to Read Next'):
h2.extract()
for ph in soup.findAll('a', attrs={'data-type':['phrase', 'link']}):
if div := ph.findParent('div'):
div.name = 'span'
for auth in soup.findAll('a', attrs={'aria-label': lambda x: x and x.startswith('Author page')}):
if div := auth.find_previous_sibling('div'):
div.name = 'span'
if parent := auth.findParent('div'):
parent['class'] = 'auth'
return soup return soup
# login {{{ # login {{{
@ -133,7 +97,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com') br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br return br
if False and needs_subscription: # disabled as we currently use archive.is if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint from pprint import pprint
pprint pprint
@ -304,6 +268,6 @@ class WSJ(BasicNewsRecipe):
return [ return [
('Testing', [ ('Testing', [
{'title': 'Subscriber Article', {'title': 'Subscriber Article',
'url': self.abs_wsj_url('https://www.wsj.com/articles/remington-gun-call-of-duty-video-game-93059a66')}, 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
]), ]),
] ]

View File

@ -10,8 +10,7 @@ from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try: try:
@ -60,69 +59,33 @@ class WSJ(BasicNewsRecipe):
needs_subscription = needs_subscription needs_subscription = needs_subscription
WSJ_ITP = 'https://www.wsj.com/print-edition/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
storage = []
extra_css = ''' extra_css = '''
#big-top-caption { font-size:small; text-align:center; } .imageCaption{font-size:small; text-align:center;}
[data-type:"tagline"] { font-style:italic; color:#202020; } .sub-head{font-style:italic; color:#404040;}
[data-type:"tagline"], em { font-style:italic; color:#202020; } .bylineWrap{font-size:small; text-align:left;}
.auth { font-size:small; }
''' '''
keep_only_tags = [ keep_only_tags = [
dict(name=['h1', 'h2']), dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
dict(attrs={'aria-describedby':'big-top-caption'}), dict(name='main'),
dict(attrs={'id':'big-top-caption'}),
dict(name='article')
] ]
remove_tags = [ remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget']), classes(
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}), 'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
dict(attrs={'data-type':'inset'}), ' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
dict(attrs={'id':lambda x: x and x.startswith(('wrapper-INLINE', 'audio-tag-inner-audio-'))}) dict(role=["toolbar", "complementary"]),
dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
dict(name='amp-iframe'), # interactive graphics
] ]
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser()
br.set_handle_redirect(False)
try:
br.open(url)
except Exception as e:
url = e.hdrs.get('location')
raw = read_url(self.storage, 'https://archive.is/latest/' + url)
pt = PersistentTemporaryFile('.html')
pt.write(raw.encode('utf-8'))
pt.close()
return pt.name
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}): for by in soup.findAll(**classes('bylineWrap')):
img['src'] = img['old-src'] for p in by.findAll('p'):
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): p.name = 'span'
p.name = 'p' h2 = soup.find('h2', attrs={'class':'sub-head'})
for a in soup.findAll('a', href=True): if h2:
a['href'] = 'http' + a['href'].split('http')[-1] h2.name = 'p'
for fig in soup.findAll('figure'):
if fig.find('video'):
fig.extract()
for figc in soup.findAll('figcaption'):
figc['id'] = 'big-top-caption'
if name:= soup.find('h2', attrs={'itemprop':'name'}):
name.extract()
for h2 in soup.findAll('h2'):
if self.tag_to_string(h2).startswith('What to Read Next'):
h2.extract()
for ph in soup.findAll('a', attrs={'data-type':['phrase', 'link']}):
if div := ph.findParent('div'):
div.name = 'span'
for auth in soup.findAll('a', attrs={'aria-label': lambda x: x and x.startswith('Author page')}):
if div := auth.find_previous_sibling('div'):
div.name = 'span'
if parent := auth.findParent('div'):
parent['class'] = 'auth'
return soup return soup
# login {{{ # login {{{
@ -134,7 +97,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com') br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br return br
if False and needs_subscription: # disabled as we currently use archive.is if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint from pprint import pprint
pprint pprint
@ -305,6 +268,6 @@ class WSJ(BasicNewsRecipe):
return [ return [
('Testing', [ ('Testing', [
{'title': 'Subscriber Article', {'title': 'Subscriber Article',
'url': self.abs_wsj_url('https://www.wsj.com/articles/remington-gun-call-of-duty-video-game-93059a66')}, 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
]), ]),
] ]