mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
make subscription for WSJ optional as recipe currently uses archive.is
This commit is contained in:
parent
9d7c20c267
commit
d62dab1e2e
@ -10,7 +10,8 @@ from base64 import standard_b64encode
|
|||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -22,10 +23,8 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
|
|
||||||
from calibre.scraper.simple import read_url
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
|
|
||||||
needs_subscription = True
|
needs_subscription = 'optional'
|
||||||
|
|
||||||
|
|
||||||
def substring_classes(classes):
|
def substring_classes(classes):
|
||||||
@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
articles_are_obfuscated = True
|
articles_are_obfuscated = True
|
||||||
def get_obfuscated_article(self, url):
|
def get_obfuscated_article(self, url):
|
||||||
|
from calibre.scraper.simple import read_url
|
||||||
br = self.get_browser()
|
br = self.get_browser()
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
try:
|
try:
|
||||||
@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
pt.close()
|
pt.close()
|
||||||
return pt.name
|
return pt.name
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for img in soup.findAll('img', attrs={'old-src':True}):
|
for img in soup.findAll('img', attrs={'old-src':True}):
|
||||||
img['src'] = img['old-src']
|
img['src'] = img['old-src']
|
||||||
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
||||||
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
||||||
return br
|
return br
|
||||||
|
|
||||||
if needs_subscription:
|
if False and needs_subscription: # disabled as we currently use archive.is
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
pprint
|
pprint
|
||||||
|
@ -10,7 +10,8 @@ from base64 import standard_b64encode
|
|||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -22,8 +23,6 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
|
|
||||||
from calibre.scraper.simple import read_url
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
|
|
||||||
needs_subscription = False
|
needs_subscription = False
|
||||||
|
|
||||||
@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe):
|
|||||||
#big-top-caption { font-size:small; text-align:center; }
|
#big-top-caption { font-size:small; text-align:center; }
|
||||||
[data-type:"tagline"] { font-style:italic; color:#202020; }
|
[data-type:"tagline"] { font-style:italic; color:#202020; }
|
||||||
'''
|
'''
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(name=['h1', 'h2']),
|
dict(name=['h1', 'h2']),
|
||||||
dict(attrs={'aria-describedby':'big-top-caption'}),
|
dict(attrs={'aria-describedby':'big-top-caption'}),
|
||||||
dict(attrs={'id':'big-top-caption'}),
|
dict(attrs={'id':'big-top-caption'}),
|
||||||
dict(name='article')
|
dict(name='article')
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['button', 'svg', 'ufc-follow-author-widget']),
|
dict(name=['button', 'svg', 'ufc-follow-author-widget']),
|
||||||
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
|
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
|
||||||
@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
articles_are_obfuscated = True
|
articles_are_obfuscated = True
|
||||||
def get_obfuscated_article(self, url):
|
def get_obfuscated_article(self, url):
|
||||||
|
from calibre.scraper.simple import read_url
|
||||||
br = self.get_browser()
|
br = self.get_browser()
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
try:
|
try:
|
||||||
@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
pt.close()
|
pt.close()
|
||||||
return pt.name
|
return pt.name
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for img in soup.findAll('img', attrs={'old-src':True}):
|
for img in soup.findAll('img', attrs={'old-src':True}):
|
||||||
img['src'] = img['old-src']
|
img['src'] = img['old-src']
|
||||||
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
|
||||||
@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
h2.extract()
|
h2.extract()
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
# login {{{
|
# login {{{
|
||||||
|
|
||||||
def get_browser_for_wsj(self, *a, **kw):
|
def get_browser_for_wsj(self, *a, **kw):
|
||||||
@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
br.set_cookie('ccpaApplies', 'false', '.wsj.com')
|
||||||
return br
|
return br
|
||||||
|
|
||||||
if needs_subscription:
|
if False and needs_subscription: # disabled as we currently use archive.is
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
pprint
|
pprint
|
||||||
|
Loading…
x
Reference in New Issue
Block a user