Make subscription for WSJ optional, as the recipe currently uses archive.is

This commit is contained in:
Kovid Goyal 2023-10-18 06:41:13 +05:30
parent 9d7c20c267
commit d62dab1e2e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 13 additions and 13 deletions

View File

@@ -10,7 +10,8 @@ from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try: try:
@@ -22,10 +23,8 @@ try:
except ImportError: except ImportError:
from urllib import quote from urllib import quote
from calibre.scraper.simple import read_url
from calibre.ptempfile import PersistentTemporaryFile
needs_subscription = True needs_subscription = 'optional'
def substring_classes(classes): def substring_classes(classes):
@@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe):
articles_are_obfuscated = True articles_are_obfuscated = True
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser() br = self.get_browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
try: try:
@@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe):
pt.close() pt.close()
return pt.name return pt.name
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}): for img in soup.findAll('img', attrs={'old-src':True}):
img['src'] = img['old-src'] img['src'] = img['old-src']
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com') br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br return br
if needs_subscription: if False and needs_subscription: # disabled as we currently use archive.is
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint from pprint import pprint
pprint pprint

View File

@@ -10,7 +10,8 @@ from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try: try:
@@ -22,8 +23,6 @@ try:
except ImportError: except ImportError:
from urllib import quote from urllib import quote
from calibre.scraper.simple import read_url
from calibre.ptempfile import PersistentTemporaryFile
needs_subscription = False needs_subscription = False
@@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe):
#big-top-caption { font-size:small; text-align:center; } #big-top-caption { font-size:small; text-align:center; }
[data-type:"tagline"] { font-style:italic; color:#202020; } [data-type:"tagline"] { font-style:italic; color:#202020; }
''' '''
keep_only_tags = [ keep_only_tags = [
dict(name=['h1', 'h2']), dict(name=['h1', 'h2']),
dict(attrs={'aria-describedby':'big-top-caption'}), dict(attrs={'aria-describedby':'big-top-caption'}),
dict(attrs={'id':'big-top-caption'}), dict(attrs={'id':'big-top-caption'}),
dict(name='article') dict(name='article')
] ]
remove_tags = [ remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget']), dict(name=['button', 'svg', 'ufc-follow-author-widget']),
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}), dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
@@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe):
articles_are_obfuscated = True articles_are_obfuscated = True
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser() br = self.get_browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
try: try:
@@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe):
pt.close() pt.close()
return pt.name return pt.name
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}): for img in soup.findAll('img', attrs={'old-src':True}):
img['src'] = img['old-src'] img['src'] = img['old-src']
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe):
h2.extract() h2.extract()
return soup return soup
# login {{{ # login {{{
def get_browser_for_wsj(self, *a, **kw): def get_browser_for_wsj(self, *a, **kw):
@@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com') br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br return br
if needs_subscription: if False and needs_subscription: # disabled as we currently use archive.is
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint from pprint import pprint
pprint pprint