Make the WSJ subscription optional, as the recipe currently uses archive.is

2025-07-09 03:04:10 -04:00 · 2023-10-18 06:41:13 +05:30 · 2023-10-18 06:41:13 +05:30 · d62dab1e2e
commit d62dab1e2e
parent 9d7c20c267
2 changed files with 13 additions and 13 deletions
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -10,7 +10,8 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request

-from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select

 try:
@@ -22,10 +23,8 @@ try:
 except ImportError:
    from urllib import quote

-from calibre.scraper.simple import read_url
-from calibre.ptempfile import PersistentTemporaryFile

-needs_subscription = True
+needs_subscription = 'optional'


 def substring_classes(classes):
@@ -84,6 +83,7 @@ class WSJ(BasicNewsRecipe):

    articles_are_obfuscated = True
    def get_obfuscated_article(self, url):
+        from calibre.scraper.simple import read_url
        br = self.get_browser()
        br.set_handle_redirect(False)
        try:
@@ -96,7 +96,7 @@ class WSJ(BasicNewsRecipe):
        pt.close()
        return pt.name

-   def preprocess_html(self, soup):                     
+    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'old-src':True}):
            img['src'] = img['old-src']
        for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
        br.set_cookie('ccpaApplies', 'false', '.wsj.com')
        return br

-    if needs_subscription:
+    if False and needs_subscription:  # disabled as we currently use archive.is
        def get_browser(self, *a, **kw):
            from pprint import pprint
            pprint
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -10,7 +10,8 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request

-from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select

 try:
@@ -22,8 +23,6 @@ try:
 except ImportError:
    from urllib import quote

-from calibre.scraper.simple import read_url
-from calibre.ptempfile import PersistentTemporaryFile

 needs_subscription = False

@@ -67,13 +66,14 @@ class WSJ(BasicNewsRecipe):
        #big-top-caption { font-size:small; text-align:center; }
        [data-type:"tagline"] { font-style:italic; color:#202020; }
    '''
-    
+
    keep_only_tags = [
        dict(name=['h1', 'h2']),
        dict(attrs={'aria-describedby':'big-top-caption'}),
        dict(attrs={'id':'big-top-caption'}),
        dict(name='article')
    ]
+
    remove_tags = [
        dict(name=['button', 'svg', 'ufc-follow-author-widget']),
        dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next']}),
@@ -83,6 +83,7 @@ class WSJ(BasicNewsRecipe):

    articles_are_obfuscated = True
    def get_obfuscated_article(self, url):
+        from calibre.scraper.simple import read_url
        br = self.get_browser()
        br.set_handle_redirect(False)
        try:
@@ -95,7 +96,7 @@ class WSJ(BasicNewsRecipe):
        pt.close()
        return pt.name

-    def preprocess_html(self, soup):                     
+    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'old-src':True}):
            img['src'] = img['old-src']
        for p in soup.findAll('div', attrs={'data-type':['paragraph', 'image']}):
@@ -114,7 +115,6 @@ class WSJ(BasicNewsRecipe):
                h2.extract()
        return soup

-
    # login {{{

    def get_browser_for_wsj(self, *a, **kw):
@@ -124,7 +124,7 @@ class WSJ(BasicNewsRecipe):
        br.set_cookie('ccpaApplies', 'false', '.wsj.com')
        return br

-    if needs_subscription:
+    if False and needs_subscription:  # disabled as we currently use archive.is
        def get_browser(self, *a, **kw):
            from pprint import pprint
            pprint