Kovid Goyal 2023-10-23 12:38:45 +05:30
commit b9ac7c7ed7
5 changed files with 121 additions and 74 deletions

View File

@@ -132,4 +132,6 @@ class ft(BasicNewsRecipe):
         for con in soup.findAll(attrs={'class':'n-content-layout__slot'}):
             if con.find('figure'):
                 con['id'] = 'fig'
+        if h3 := soup.find(**classes('o-topper__standfirst')):
+            h3.name = 'h3'
         return soup
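Note: classes() used above is the helper exported by calibre.web.feeds.news alongside BasicNewsRecipe; it turns a space-separated list of CSS class names into keyword arguments for soup.find()/findAll() that match any element sharing at least one of those classes. Essentially the helper calibre ships:

    def classes(classes):
        # Build find()/findAll() keyword arguments that match elements whose
        # class attribute intersects the given space-separated names.
        q = frozenset(classes.split(' '))
        return dict(attrs={
            'class': lambda x: x and frozenset(x.split()).intersection(q)})

    # e.g. soup.find(**classes('o-topper__standfirst'))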

View File

@@ -98,7 +98,7 @@ class LiveMint(BasicNewsRecipe):
         classes(
             'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight'
             ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot'
-            ' datePublish sepStory premiumSlider moreStory'
+            ' datePublish sepStory premiumSlider moreStory Joinus'
         )
     ]
@@ -149,6 +149,9 @@ class LiveMint(BasicNewsRecipe):
                 span.extract()
         for img in soup.findAll('img', attrs={'data-src': True}):
             img['src'] = img['data-src']
+        if wa := soup.find(**classes('autobacklink-topic')):
+            if p := wa.findParent('p'):
+                p.extract()
         return soup

     def populate_article_metadata(self, article, soup, first):
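Review note: the added guard strips LiveMint's promotional backlink (the variable name wa and the new 'Joinus' class in remove_tags suggest a WhatsApp "join us" blurb) by deleting the paragraph that wraps the autobacklink-topic element. find() returns None on a miss, so the nested walrus guards keep the chain from raising AttributeError. A standalone sketch, with hypothetical sample markup:

    from bs4 import BeautifulSoup

    # Hypothetical markup standing in for LiveMint's promo paragraph.
    html = '<p>Join us on <span class="autobacklink-topic">WhatsApp</span></p>'
    soup = BeautifulSoup(html, 'html.parser')
    if wa := soup.find(attrs={'class': 'autobacklink-topic'}):
        if p := wa.findParent('p'):
            p.extract()  # removes the whole promo paragraph, not just the span
    print(soup)          # -> nothing left of the paragraph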

View File

@@ -61,9 +61,10 @@ class projectsynd(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'old-src':True}):
             img['src'] = img['old-src'].replace('medium', 'xlarge')
-        if abst := soup.find(attrs={'itemprop':'abstract'}).find('div'):
-            abst.name = 'p'
-            abst['class'] = 'sub'
+        if abst := soup.find(attrs={'itemprop':'abstract'}):
+            if div := abst.find('div'):
+                div.name = 'p'
+                div['class'] = 'sub'
         for div in soup.findAll('div', attrs={'data-line-id':True}):
             div.name = 'p'
         for a in soup.findAll('a', href=True):
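Review note: this hunk is a crash fix as much as a cleanup. The old one-liner chained .find('div') onto the result of soup.find(attrs={'itemprop':'abstract'}), which is None for articles without an abstract, so the recipe died with AttributeError: 'NoneType' object has no attribute 'find'. The guarded two-step lookup degrades gracefully when either the abstract or its inner div is missing.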

View File

@@ -10,7 +10,8 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request

-from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select

 try:
@@ -23,21 +24,7 @@ except ImportError:
     from urllib import quote

-needs_subscription = True
+needs_subscription = 'optional'

-def substring_classes(classes):
-    q = frozenset(classes.split(' '))
-
-    def matcher(x):
-        if x:
-            for candidate in frozenset(x.split()):
-                for x in q:
-                    if x in candidate:
-                        return True
-        return False
-
-    return {'attrs': {'class': matcher}}
-

 class WSJ(BasicNewsRecipe):
@@ -59,33 +46,70 @@ class WSJ(BasicNewsRecipe):
     needs_subscription = needs_subscription
     WSJ_ITP = 'https://www.wsj.com/print-edition/today'

+    storage = []
+
     extra_css = '''
-        .imageCaption{font-size:small; text-align:center;}
-        .sub-head{font-style:italic; color:#404040;}
-        .bylineWrap{font-size:small; text-align:left;}
+        #big-top-caption { font-size:small; text-align:center; }
+        [data-type="tagline"], em { font-style:italic; color:#202020; }
+        .auth { font-size:small; }
     '''

     keep_only_tags = [
-        dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
-        dict(name='main'),
+        dict(name=['h1', 'h2']),
+        dict(attrs={'aria-describedby':'big-top-caption'}),
+        dict(attrs={'id':'big-top-caption'}),
+        dict(name='article')
     ]
     remove_tags = [
-        classes(
-            'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
-            ' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
-        dict(role=["toolbar", "complementary"]),
-        dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
-        dict(name='amp-iframe'), # interactive graphics
+        dict(name=['button', 'svg', 'ufc-follow-author-widget', 'old-script']),
+        dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar', 'Conversation']}),
+        dict(attrs={'data-type':'inset'}),
+        dict(attrs={'data-spotim-app':'conversation'}),
+        dict(attrs={'data-spot-im-class':['message-text', 'conversation-root']}),
+        dict(attrs={'id':lambda x: x and x.startswith(('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-'))}),
     ]

+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        from calibre.scraper.simple import read_url
+        br = self.get_browser()
+        br.set_handle_redirect(False)
+        try:
+            br.open(url)
+        except Exception as e:
+            url = e.hdrs.get('location')
+        raw = read_url(self.storage, 'https://archive.is/latest/' + url)
+        pt = PersistentTemporaryFile('.html')
+        pt.write(raw.encode('utf-8'))
+        pt.close()
+        return pt.name
+
     def preprocess_html(self, soup):
-        for by in soup.findAll(**classes('bylineWrap')):
-            for p in by.findAll('p'):
-                p.name = 'span'
-        h2 = soup.find('h2', attrs={'class':'sub-head'})
-        if h2:
-            h2.name = 'p'
+        for img in soup.findAll('img', attrs={'old-src':True}):
+            img['src'] = img['old-src']
+        for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
+            p.name = 'p'
+        for a in soup.findAll('a', href=True):
+            a['href'] = 'http' + a['href'].split('http')[-1]
+        for fig in soup.findAll('figure'):
+            if fig.find('video'):
+                fig.extract()
+        for figc in soup.findAll('figcaption'):
+            figc['id'] = 'big-top-caption'
+        if name := soup.find('h2', attrs={'itemprop':'name'}):
+            name.extract()
+        for h2 in soup.findAll('h2'):
+            if self.tag_to_string(h2).startswith(('What to Read Next', 'Conversation')):
+                h2.extract()
+        for ph in soup.findAll('a', attrs={'data-type':['phrase', 'link']}):
+            if div := ph.findParent('div'):
+                div.name = 'span'
+        for auth in soup.findAll('a', attrs={'aria-label': lambda x: x and x.startswith('Author page')}):
+            if div := auth.find_previous_sibling('div'):
+                div.name = 'span'
+            if parent := auth.findParent('div'):
+                parent['class'] = 'auth'
         return soup

     # login {{{
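Review note: the paywall workaround in get_obfuscated_article() first opens the WSJ URL with redirect-following disabled, so the canonical article URL can be read out of the refused redirect's Location header; it then fetches that URL's latest snapshot from archive.is through calibre's read_url scraper and returns a temporary HTML file for the recipe to parse. A rough standalone sketch of the same control flow using plain mechanize (read_url can execute JavaScript, so this is illustrative, not a drop-in replacement; archive.is availability is an external assumption):

    import tempfile
    import mechanize

    def fetch_via_archive(url):
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_redirect(False)   # surface the redirect instead of following it
        try:
            br.open(url)
        except Exception as e:
            # wsj.com redirects to the canonical article URL; recover it
            # from the Location header of the refused redirect.
            url = e.hdrs.get('location') or url
        br.set_handle_redirect(True)    # archive.is itself redirects to the snapshot
        raw = br.open('https://archive.is/latest/' + url).read()
        pt = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
        pt.write(raw)
        pt.close()
        return pt.name                  # the recipe machinery expects a file path back

The companion rewrite in preprocess_html, a['href'] = 'http' + a['href'].split('http')[-1], then undoes archive.today's link wrapping (outbound links come back as https://archive.is/o/<id>/https://www.wsj.com/...), keeping only the trailing original URL.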
@@ -97,7 +121,7 @@ class WSJ(BasicNewsRecipe):
         br.set_cookie('ccpaApplies', 'false', '.wsj.com')
         return br

-    if needs_subscription:
+    if False and needs_subscription:  # disabled as we currently use archive.is
         def get_browser(self, *a, **kw):
             from pprint import pprint
             pprint
@@ -268,6 +292,6 @@ class WSJ(BasicNewsRecipe):
         return [
             ('Testing', [
                 {'title': 'Subscriber Article',
-                 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
+                 'url': self.abs_wsj_url('https://www.wsj.com/articles/remington-gun-call-of-duty-video-game-93059a66')},
             ]),
         ]

View File

@@ -10,7 +10,7 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request

-from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select

 try:
@@ -22,23 +22,12 @@ try:
 except ImportError:
     from urllib import quote

+from calibre.scraper.simple import read_url
+from calibre.ptempfile import PersistentTemporaryFile
+
 needs_subscription = False

-def substring_classes(classes):
-    q = frozenset(classes.split(' '))
-
-    def matcher(x):
-        if x:
-            for candidate in frozenset(x.split()):
-                for x in q:
-                    if x in candidate:
-                        return True
-        return False
-
-    return {'attrs': {'class': matcher}}
-

 class WSJ(BasicNewsRecipe):

     if needs_subscription:
@@ -59,35 +48,63 @@ class WSJ(BasicNewsRecipe):
     needs_subscription = needs_subscription
     WSJ_ITP = 'https://www.wsj.com/print-edition/today'

-    extra_css = '''
-        .imageCaption{font-size:small; text-align:center;}
-        .sub-head{font-style:italic; color:#404040;}
-        .bylineWrap{font-size:small; text-align:left;}
-    '''
+    storage = []
+
+    extra_css = '''
+        #big-top-caption { font-size:small; text-align:center; }
+        [data-type="tagline"] { font-style:italic; color:#202020; }
+    '''

     keep_only_tags = [
-        dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
-        dict(name='main'),
+        dict(name=['h1', 'h2']),
+        dict(attrs={'aria-describedby':'big-top-caption'}),
+        dict(attrs={'id':'big-top-caption'}),
+        dict(name='article')
     ]
     remove_tags = [
-        classes(
-            'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
-            ' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
-        dict(role=["toolbar", "complementary"]),
-        dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
-        dict(name='amp-iframe'), # interactive graphics
+        dict(name=['button', 'svg', 'ufc-follow-author-widget', 'old-script']),
+        dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar', 'Conversation']}),
+        dict(attrs={'data-type':'inset'}),
+        dict(attrs={'data-spotim-app':'conversation'}),
+        dict(attrs={'data-spot-im-class':['message-text', 'conversation-root']}),
+        dict(attrs={'id':lambda x: x and x.startswith(('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-'))}),
     ]

+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        br.set_handle_redirect(False)
+        try:
+            br.open(url)
+        except Exception as e:
+            url = e.hdrs.get('location')
+        raw = read_url(self.storage, 'https://archive.is/latest/' + url)
+        pt = PersistentTemporaryFile('.html')
+        pt.write(raw.encode('utf-8'))
+        pt.close()
+        return pt.name
+
     def preprocess_html(self, soup):
-        for by in soup.findAll(**classes('bylineWrap')):
-            for p in by.findAll('p'):
-                p.name = 'span'
-        h2 = soup.find('h2', attrs={'class':'sub-head'})
-        if h2:
-            h2.name = 'p'
+        for img in soup.findAll('img', attrs={'old-src':True}):
+            img['src'] = img['old-src']
+        for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
+            p.name = 'p'
+        for a in soup.findAll('a', href=True):
+            a['href'] = 'http' + a['href'].split('http')[-1]
+        for fig in soup.findAll('figure'):
+            if fig.find('video'):
+                fig.extract()
+        for figc in soup.findAll('figcaption'):
+            figc['id'] = 'big-top-caption'
+        if name := soup.find('h2', attrs={'itemprop':'name'}):
+            name.extract()
+        for h2 in soup.findAll('h2'):
+            if self.tag_to_string(h2).startswith(('What to Read Next', 'Conversation')):
+                h2.extract()
         return soup

     # login {{{
     def get_browser_for_wsj(self, *a, **kw):
@@ -268,6 +285,6 @@ class WSJ(BasicNewsRecipe):
         return [
             ('Testing', [
                 {'title': 'Subscriber Article',
-                 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
+                 'url': self.abs_wsj_url('https://www.wsj.com/articles/remington-gun-call-of-duty-video-game-93059a66')},
             ]),
         ]
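To exercise the Testing feed above during development, either recipe can be run directly through calibre's converter, e.g. something like ebook-convert wsj.recipe out.epub --test -vv, which restricts the fetch to a couple of articles per feed and logs network activity verbosely.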