Update wsj.recipe

This commit is contained in:
unkn0w7n 2024-06-12 10:44:58 +05:30
parent 2dedc79fcd
commit 9f9cafaf1c

View File

@@ -1,12 +1,10 @@
import json
import re
import time
from itertools import zip_longest
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
from html5_parser import parse

# Past 6 editions are available for download.
# For available past editions see log and set date to, for example, '20240513'.
@@ -47,7 +45,6 @@ class WSJ(BasicNewsRecipe):
    '''

    remove_tags = [
        dict(name=['nav', 'svg', 'iframe', 'source']),
        dict(name='panel', attrs={'id':'summary-image'}),
        dict(name='panel', attrs={'layout':'inline'}),
        dict(name='panel', attrs={'embed':'inner-article-ad'}),
@@ -57,16 +54,11 @@ class WSJ(BasicNewsRecipe):
            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
        ]}),
        dict(attrs={'data-inset_type':'dynamic'}),
        dict(attrs={'data-block':'dynamic-inset'})
    ]

    remove_tags_before = [
        dict(name='p', attrs={'id':'orig-pubdate-string'})
    ]
    remove_tags_after = [
        dict(name='article')
    ]

    def preprocess_html(self, soup):
        jpml = soup.find('jpml')
@@ -88,21 +80,6 @@ class WSJ(BasicNewsRecipe):
            if url:
                url['title'] = self.tag_to_string(url).strip()
                url.string = ''
        for figc in soup.findAll('figcaption'):
            figc['class'] = 'figc'
        col = soup.find('div', text = re.compile('What to Read Next'))
        if col:
            div = col.findParent('div')
            if div:
                div.extract()
        time = soup.find('time')
        if time:
            p = time.findParent('div')
            if p:
                p.name = 'p'
        for img in soup.findAll('img', src=True):
            if img['src'].endswith('/OR'):
                img['src'] = img['src'][:-3]
        panel = soup.find('panel', attrs={'id':'metadata'})
        if panel:
            buck = panel.find('p', attrs={'id':'media-bucket'})
@@ -185,9 +162,6 @@ class WSJ(BasicNewsRecipe):
            for art in data:
                title = data[art]['headline']
                desc = data[art]['summary']
                if 'articleWebViewLink' in data[art]:
                    url = data[art]['articleWebViewLink']
                else:
                    url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
                self.log(' ', title, '\n\t', desc)
                articles.append({'title': title, 'description':desc, 'url': url})
        return feeds
@@ -195,28 +169,9 @@ class WSJ(BasicNewsRecipe):
    def preprocess_raw_html(self, raw, url):
        if '/webview/' not in url:
            root = parse(raw)
            for x in root.xpath('//image'):
                x.tag = 'img'
            return BeautifulSoup(raw).prettify()
        else:
            soup = BeautifulSoup(raw)
            url = soup.find('meta', attrs={'property':'og:url'})
            if url:
                h1 = soup.find('h1')
                if h1:
                    h1['title'] = url['content']
                h2 = soup.find('h2')
                if h2:
                    h2['id'] = 'subhed'
                    h2.name = 'p'
            return soup.prettify()

    def populate_article_metadata(self, article, soup, first):
        lnk = soup.find('p', attrs={'id':'share-link'})
        if lnk:
            article.url = lnk['title']
        art = soup.find('h1', attrs={'title':True})
        if art:
            article.url = art['title']