Make subscription for WSJ optional, as the recipe currently uses archive.is

This commit is contained in:
Kovid Goyal 2023-10-18 06:41:13 +05:30
parent 9d7c20c267
commit d62dab1e2e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 13 additions and 13 deletions

View File

@ -10,7 +10,8 @@ from base64 import standard_b64encode
from datetime import date, timedelta
from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select
try:
@ -22,10 +23,8 @@ try:
except ImportError:
from urllib import quote
from calibre.scraper.simple import read_url
from calibre.ptempfile import PersistentTemporaryFile
needs_subscription = True
needs_subscription = 'optional'
def substring_classes(classes):
@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe):
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser()
br.set_handle_redirect(False)
try:
@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe):
pt.close()
return pt.name
def preprocess_html(self, soup):
def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}):
img['src'] = img['old-src']
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br
if needs_subscription:
if False and needs_subscription: # disabled as we currently use archive.is
def get_browser(self, *a, **kw):
from pprint import pprint
pprint

View File

@ -10,7 +10,8 @@ from base64 import standard_b64encode
from datetime import date, timedelta
from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select
try:
@ -22,8 +23,6 @@ try:
except ImportError:
from urllib import quote
from calibre.scraper.simple import read_url
from calibre.ptempfile import PersistentTemporaryFile
needs_subscription = False
@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe):
#big-top-caption { font-size:small; text-align:center; }
[data-type:"tagline"] { font-style:italic; color:#202020; }
'''
keep_only_tags = [
dict(name=['h1', 'h2']),
dict(attrs={'aria-describedby':'big-top-caption'}),
dict(attrs={'id':'big-top-caption'}),
dict(name='article')
]
remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget']),
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe):
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
from calibre.scraper.simple import read_url
br = self.get_browser()
br.set_handle_redirect(False)
try:
@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe):
pt.close()
return pt.name
def preprocess_html(self, soup):
def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'old-src':True}):
img['src'] = img['old-src']
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe):
h2.extract()
return soup
# login {{{
def get_browser_for_wsj(self, *a, **kw):
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
return br
if needs_subscription:
if False and needs_subscription: # disabled as we currently use archive.is
def get_browser(self, *a, **kw):
from pprint import pprint
pprint