From d62dab1e2e7c1f30ffb29049caa9381ba9d31a20 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 18 Oct 2023 06:41:13 +0530 Subject: [PATCH] make subscription for WSJ optional as recipe currently uses archive.is --- recipes/wsj.recipe | 12 ++++++------ recipes/wsj_free.recipe | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index c39565c6d1..fa9377c7e0 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -10,7 +10,8 @@ from base64 import standard_b64encode from datetime import date, timedelta from mechanize import Request -from calibre.web.feeds.news import BasicNewsRecipe, classes +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe from css_selectors import Select try: @@ -22,10 +23,8 @@ try: except ImportError: from urllib import quote -from calibre.scraper.simple import read_url -from calibre.ptempfile import PersistentTemporaryFile -needs_subscription = True +needs_subscription = 'optional' def substring_classes(classes): @@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe): articles_are_obfuscated = True def get_obfuscated_article(self, url): + from calibre.scraper.simple import read_url br = self.get_browser() br.set_handle_redirect(False) try: @@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe): pt.close() return pt.name - def preprocess_html(self, soup): + def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'old-src':True}): img['src'] = img['old-src'] for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): @@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe): br.set_cookie('ccpaApplies', 'false', '.wsj.com') return br - if needs_subscription: + if False and needs_subscription: # disabled as we currently use archive.is def get_browser(self, *a, **kw): from pprint import pprint pprint diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 97bf2064e1..3099d9fff8 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -10,7 +10,8 @@ from base64 import standard_b64encode from datetime import date, timedelta from mechanize import Request -from calibre.web.feeds.news import BasicNewsRecipe, classes +from calibre.ptempfile import PersistentTemporaryFile +from calibre.web.feeds.news import BasicNewsRecipe from css_selectors import Select try: @@ -22,8 +23,6 @@ try: except ImportError: from urllib import quote -from calibre.scraper.simple import read_url -from calibre.ptempfile import PersistentTemporaryFile needs_subscription = False @@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe): #big-top-caption { font-size:small; text-align:center; } [data-type:"tagline"] { font-style:italic; color:#202020; } ''' - + keep_only_tags = [ dict(name=['h1', 'h2']), dict(attrs={'aria-describedby':'big-top-caption'}), dict(attrs={'id':'big-top-caption'}), dict(name='article') ] + remove_tags = [ dict(name=['button', 'svg', 'ufc-follow-author-widget']), dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}), @@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe): articles_are_obfuscated = True def get_obfuscated_article(self, url): + from calibre.scraper.simple import read_url br = self.get_browser() br.set_handle_redirect(False) try: @@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe): pt.close() return pt.name - def preprocess_html(self, soup): + def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'old-src':True}): img['src'] = img['old-src'] for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}): @@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe): h2.extract() return soup - # login {{{ def get_browser_for_wsj(self, *a, **kw): @@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe): br.set_cookie('ccpaApplies', 'false', '.wsj.com') return br - if needs_subscription: + if False and needs_subscription: # disabled as we currently use archive.is def get_browser(self, *a, **kw): from pprint import pprint pprint