Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-07-09 03:04:10 -04:00 · 2024-07-06 11:31:40 +05:30 · 2024-07-06 11:31:40 +05:30 · 2bb2573ff8
commit 2bb2573ff8
parent d01c4c03ac dd684bc70a
5 changed files with 23 additions and 19 deletions
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@ -114,3 +114,9 @@ class Guardian(BasicNewsRecipe):
        feeds = list(self.parse_section(self.base_url))
        feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
        return feeds
    def preprocess_html(self, soup):
        '''
        Strip oversized tables out of the parsed article.

        Every ``table`` element found in ``soup`` that contains more than
        20 ``tr`` elements is removed with ``decompose()``; tables at or
        below that limit are left alone.  The same ``soup`` object is
        returned.
        '''
        row_limit = 20
        for tbl in soup.findAll('table'):
            row_count = len(tbl.findAll('tr'))
            if row_count <= row_limit:
                # Small enough to keep.
                continue
            tbl.decompose()
        return soup
--- a/recipes/mit_technology_review.recipe
+++ b/recipes/mit_technology_review.recipe
@ -4,11 +4,12 @@ from __future__ import unicode_literals
 __license__ = 'GPL v3'
 __copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
 # Written April 2015
-# Last edited 08/2022
+# Last edited 07/2024
 '''
 technologyreview.com
 '''
 import json
 import re
 from collections import OrderedDict
 from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
@ -38,7 +39,6 @@ class MitTechnologyReview(BasicNewsRecipe):
        ' This is different than the recipe named simply "Technology Review"'
        ' which downloads the rss feed with daily articles from the website.'
    )
    INDEX = 'http://www.technologyreview.com/magazine/'
    language = 'en'
    encoding = 'utf-8'
    tags = 'news, technology, science'
@ -65,22 +65,19 @@ class MitTechnologyReview(BasicNewsRecipe):
        ),
    ]
    def get_cover_url(self):
        '''
        Look up the magazine cover image URL on the site's front page.

        Loads https://www.technologyreview.com/ through ``index_to_soup``
        and looks for the ``<script id="preload">`` element.  The text that
        follows the first ``magazineCover":`` marker in that script is
        decoded as JSON, and the ``src`` entry of its ``config`` object is
        returned.  When no such script element is found, nothing is
        returned (``None``).
        '''
        homepage = self.index_to_soup('https://www.technologyreview.com/')
        preload = homepage.find('script', id='preload')
        if not preload:
            return None
        script_text = preload.contents[0]
        # Keep only the piece directly after the first marker occurrence.
        after_marker = script_text.split('magazineCover\":')[1]
        # raw_decode parses the leading JSON value and ignores what follows.
        cover, _end = json.JSONDecoder().raw_decode(after_marker.strip())
        return cover['config']['src']
    def parse_index(self):
-        soup = self.index_to_soup(self.INDEX)
+        # for past editions, change the issue link below 
-        issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
+        issue = 'http://www.technologyreview.com/magazine/'
-        time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
+        soup = self.index_to_soup(issue)
-        desc = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__description')})
+        if script := soup.find('script', id='preload'):
-        self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
+            raw = script.contents[0]
-        self.description = self.tag_to_string(desc)
+            m = re.search(r'\"children\":\[{\"name\":\"magazine-hero\"', raw)
-        self.timefmt = ' [' + self.tag_to_string(time) + ']'
+            spl = re.split(r"(?=\{)", raw[m.start():], 1)[1]
-        self.log('Downloading issue: ', self.timefmt)
+            data = json.JSONDecoder().raw_decode(spl)[0]
            self.cover_url = data['children'][0]['config']['src'] + '?fit=572,786'
            self.timefmt = ' [' + data['config']['issueDate'] + ']'
            self.description = data['config']['description']
            self.title = 'MIT TR: ' + data['config']['title']
        # parse articles
        feeds = OrderedDict()
--- a/recipes/times_online.recipe
+++ b/recipes/times_online.recipe
@ -50,7 +50,7 @@ class times(BasicNewsRecipe):
    ]
    remove_tags = [
-        dict(name=['svg']),
+        dict(name=['svg', 'times-datawrapper']),
        dict(attrs={'id':'iframe-wrapper'}),
        dict(attrs={'old-position':'sticky'}),
        prefixed_classes(
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@ -156,7 +156,7 @@ class WSJ(BasicNewsRecipe):
                if '-pages_' in k:
                    section = k.split('-pages_')[0].replace('_', ' ')
                    if 'MAGAZINE' in section:
-                        if not dt.strftime('%d') == 1:
+                        if not dt.day == 1:
                            continue
                        self.log('Loading Magazine section')
                    self.log(section)
--- a/recipes/wsj_mag.recipe
+++ b/recipes/wsj_mag.recipe
@ -111,6 +111,7 @@ class WSJ(BasicNewsRecipe):
                date = itm['date']
                key = itm['key']
                manifest = itm['manifest']
                self.title = itm['label']
        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
        dt = dt.strftime('%b, %Y')