mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make the WSJ subscription optional, as the recipe currently uses archive.is
This commit is contained in:
parent
9d7c20c267
commit
d62dab1e2e
@ -10,7 +10,8 @@ from base64 import standard_b64encode
|
||||
from datetime import date, timedelta
|
||||
from mechanize import Request
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from css_selectors import Select
|
||||
|
||||
try:
|
||||
@ -22,10 +23,8 @@ try:
|
||||
except ImportError:
|
||||
from urllib import quote
|
||||
|
||||
from calibre.scraper.simple import read_url
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
needs_subscription = True
|
||||
needs_subscription = 'optional'
|
||||
|
||||
|
||||
def substring_classes(classes):
|
||||
@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe):
|
||||
|
||||
articles_are_obfuscated = True
|
||||
def get_obfuscated_article(self, url):
|
||||
from calibre.scraper.simple import read_url
|
||||
br = self.get_browser()
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe):
|
||||
pt.close()
|
||||
return pt.name
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
def preprocess_html(self, soup):
|
||||
for img in soup.findAll('img', attrs={'old-src':True}):
|
||||
img['src'] = img['old-src']
|
||||
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
||||
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
|
||||
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
||||
return br
|
||||
|
||||
if needs_subscription:
|
||||
if False and needs_subscription: # disabled as we currently use archive.is
|
||||
def get_browser(self, *a, **kw):
|
||||
from pprint import pprint
|
||||
pprint
|
||||
|
@ -10,7 +10,8 @@ from base64 import standard_b64encode
|
||||
from datetime import date, timedelta
|
||||
from mechanize import Request
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from css_selectors import Select
|
||||
|
||||
try:
|
||||
@ -22,8 +23,6 @@ try:
|
||||
except ImportError:
|
||||
from urllib import quote
|
||||
|
||||
from calibre.scraper.simple import read_url
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
needs_subscription = False
|
||||
|
||||
@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe):
|
||||
#big-top-caption { font-size:small; text-align:center; }
|
||||
[data-type:"tagline"] { font-style:italic; color:#202020; }
|
||||
'''
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name=['h1', 'h2']),
|
||||
dict(attrs={'aria-describedby':'big-top-caption'}),
|
||||
dict(attrs={'id':'big-top-caption'}),
|
||||
dict(name='article')
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['button', 'svg', 'ufc-follow-author-widget']),
|
||||
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
|
||||
@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe):
|
||||
|
||||
articles_are_obfuscated = True
|
||||
def get_obfuscated_article(self, url):
|
||||
from calibre.scraper.simple import read_url
|
||||
br = self.get_browser()
|
||||
br.set_handle_redirect(False)
|
||||
try:
|
||||
@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe):
|
||||
pt.close()
|
||||
return pt.name
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
def preprocess_html(self, soup):
|
||||
for img in soup.findAll('img', attrs={'old-src':True}):
|
||||
img['src'] = img['old-src']
|
||||
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
||||
@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe):
|
||||
h2.extract()
|
||||
return soup
|
||||
|
||||
|
||||
# login {{{
|
||||
|
||||
def get_browser_for_wsj(self, *a, **kw):
|
||||
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
|
||||
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
||||
return br
|
||||
|
||||
if needs_subscription:
|
||||
if False and needs_subscription: # disabled as we currently use archive.is
|
||||
def get_browser(self, *a, **kw):
|
||||
from pprint import pprint
|
||||
pprint
|
||||
|
Loading…
x
Reference in New Issue
Block a user