From dddc7448d6adcedc81eb58483ca449a1dcde0c69 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 8 Jun 2025 14:32:35 +0530
Subject: [PATCH] Update WSJ. Magazine
---
recipes/wsj.recipe | 5 +-
recipes/wsj_mag.recipe | 179 ++++++++++++++++++++++++++++-------------
2 files changed, 124 insertions(+), 60 deletions(-)
diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index 802f040b36..a43559af2f 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -32,8 +32,9 @@ class WSJ(BasicNewsRecipe):
title = 'The Wall Street Journal'
__author__ = 'unkn0wn'
description = (
- 'The Print Edition of WSJ. The Wall Street Journal is your source for breaking news, analysis and insights from the U.S. and '
- "around the world, the world's leading business and finance publication."
+ 'The Print Edition of WSJ. The Wall Street Journal is your source '
+ 'for breaking news, analysis and insights from the U.S. and '
+ 'around the world, the world\'s leading business and finance publication.'
)
language = 'en_US'
masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'
diff --git a/recipes/wsj_mag.recipe b/recipes/wsj_mag.recipe
index a0c4fcdf78..b50464aceb 100644
--- a/recipes/wsj_mag.recipe
+++ b/recipes/wsj_mag.recipe
@@ -2,11 +2,32 @@
# vim:fileencoding=utf-8
import json
from itertools import zip_longest
+from urllib.parse import quote, urlencode
+from calibre import browser
+from calibre.ptempfile import PersistentTemporaryFile
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
+def get_article(article_id):  # fetch one article by id and return the response body
+ from mechanize import Request  # imported at call time rather than at module level
+ mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'  # article_id is inserted verbatim (no URL quoting)
+ headers = {  # sent with the request built below
+ 'User-Agent': 'okhttp/4.10.0',  # NOTE(review): presumably mimics a mobile HTTP client - confirm
+ 'Accept-Encoding': 'gzip',
+ 'Cache-Control': 'no-cache',
+ 'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'), # noqa: ISC001
+ }  # the x-api-key value above is adjacent one-character literals that Python joins into a single string
+ br = browser()  # calibre browser obtained for this call
+ req = Request(  # request for mat_url carrying the headers above
+ mat_url,
+ headers=headers,
+ )
+ res = br.open(req)  # no error handling here: any exception from open() propagates to the caller
+ return res.read()  # body exactly as read() returns it; no parsing or decoding in this function
+
+
class WSJ(BasicNewsRecipe):
title = 'WSJ. Magazine'
__author__ = 'unkn0wn'
@@ -21,13 +42,14 @@ class WSJ(BasicNewsRecipe):
no_stylesheets = True
remove_attributes = ['style', 'height', 'width']
resolve_internal_links = True
+ simultaneous_downloads = 20
recipe_specific_options = {
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
- 'default': '600'
- }
+ 'default': '600',
+ },
}
extra_css = '''
@@ -38,20 +60,35 @@ class WSJ(BasicNewsRecipe):
'''
remove_tags = [
- dict(name='panel', attrs={'id':'summary-image'}),
- dict(name='panel', attrs={'layout':'inline'}),
- dict(name='panel', attrs={'embed':'inner-article-ad'}),
- dict(name='span', attrs={'embed':'ticker'}),
+ dict(name='panel', attrs={'id': 'summary-image'}),
+ dict(name='panel', attrs={'layout': 'inline'}),
+ dict(name='panel', attrs={'embed': 'inner-article-ad'}),
+ dict(name='span', attrs={'embed': 'ticker'}),
classes('lamrelated-articles-inset-panel'),
- dict(name='p', attrs={'id':[
- 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
- 'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
- ]}),
+ dict(
+ name='p',
+ attrs={
+ 'id': [
+ 'keywords',
+ 'orig-pubdate-number',
+ 'type',
+ 'is-custom-flashline',
+ 'grouphed',
+ 'author-ids',
+ 'article-manifest',
+ 'body-extract',
+ 'category',
+ 'sub-category',
+ 'socialhed',
+ 'summary',
+ 'deckline',
+ 'article-flashline',
+ ]
+ },
+ ),
]
- remove_tags_before = [
- dict(name='p', attrs={'id':'orig-pubdate-string'})
- ]
+ remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]
def media_bucket(self, x):
res = '?width=600'
@@ -59,16 +96,24 @@ class WSJ(BasicNewsRecipe):
if w and isinstance(w, str):
res = '?width=' + w
if x.get('type', '') == 'image':
- if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+ if (
+ x.get('subtype', '') == 'graphic'
+ or 'images.wsj.net' not in x['manifest-url']
+ ):
return '