diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 68795c062a..cf6df01729 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -114,3 +114,9 @@ class Guardian(BasicNewsRecipe):
         feeds = list(self.parse_section(self.base_url))
         feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
         return feeds
+
+    def preprocess_html(self, soup):
+        for table in soup.findAll('table'):
+            if len(table.findAll('tr')) > 20:
+                table.decompose()
+        return soup
diff --git a/recipes/mit_technology_review.recipe b/recipes/mit_technology_review.recipe
index 08bbd0b9fc..364fdc02a9 100644
--- a/recipes/mit_technology_review.recipe
+++ b/recipes/mit_technology_review.recipe
@@ -4,11 +4,12 @@ from __future__ import unicode_literals
 __license__ = 'GPL v3'
 __copyright__ = '2015 Michael Marotta '
 # Written April 2015
-# Last edited 08/2022
+# Last edited 07/2024
 '''
 technologyreview.com
 '''
 import json
+import re
 from collections import OrderedDict
 
 from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
@@ -38,7 +39,6 @@ class MitTechnologyReview(BasicNewsRecipe):
         ' This is different than the recipe named simply "Technology Review"'
         ' which downloads the rss feed with daily articles from the website.'
     )
-    INDEX = 'http://www.technologyreview.com/magazine/'
     language = 'en'
     encoding = 'utf-8'
     tags = 'news, technology, science'
@@ -65,22 +65,19 @@ class MitTechnologyReview(BasicNewsRecipe):
         ),
     ]
 
-    def get_cover_url(self):
-        soup = self.index_to_soup('https://www.technologyreview.com/')
-        if script := soup.find('script', id='preload'):
-            JSON = script.contents[0].split('magazineCover\":')[1].strip()
-            data = json.JSONDecoder().raw_decode(JSON)[0]
-            return data['config']['src']
-
     def parse_index(self):
-        soup = self.index_to_soup(self.INDEX)
-        issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
-        time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
-        desc = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__description')})
-        self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
-        self.description = self.tag_to_string(desc)
-        self.timefmt = ' [' + self.tag_to_string(time) + ']'
-        self.log('Downloading issue: ', self.timefmt)
+        # for past editions, change the issue link below
+        issue = 'http://www.technologyreview.com/magazine/'
+        soup = self.index_to_soup(issue)
+        if script := soup.find('script', id='preload'):
+            raw = script.contents[0]
+            m = re.search(r'\"children\":\[{\"name\":\"magazine-hero\"', raw)
+            spl = re.split(r"(?=\{)", raw[m.start():], 1)[1]
+            data = json.JSONDecoder().raw_decode(spl)[0]
+            self.cover_url = data['children'][0]['config']['src'] + '?fit=572,786'
+            self.timefmt = ' [' + data['config']['issueDate'] + ']'
+            self.description = data['config']['description']
+            self.title = 'MIT TR: ' + data['config']['title']
 
         # parse articles
         feeds = OrderedDict()
diff --git a/recipes/times_online.recipe b/recipes/times_online.recipe
index b3a3315c33..6449f489a3 100644
--- a/recipes/times_online.recipe
+++ b/recipes/times_online.recipe
@@ -50,7 +50,7 @@ class times(BasicNewsRecipe):
     ]
 
     remove_tags = [
-        dict(name=['svg']),
+        dict(name=['svg', 'times-datawrapper']),
         dict(attrs={'id':'iframe-wrapper'}),
         dict(attrs={'old-position':'sticky'}),
         prefixed_classes(
diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index a0e2ac25e9..c0b42bdaee 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -156,7 +156,7 @@ class WSJ(BasicNewsRecipe):
             if '-pages_' in k:
                 section = k.split('-pages_')[0].replace('_', ' ')
                 if 'MAGAZINE' in section:
-                    if not dt.strftime('%d') == 1:
+                    if not dt.day == 1:
                         continue
                     self.log('Loading Magazine section')
                 self.log(section)
diff --git a/recipes/wsj_mag.recipe b/recipes/wsj_mag.recipe
index 0a82cab0ae..c2f5db1b1d 100644
--- a/recipes/wsj_mag.recipe
+++ b/recipes/wsj_mag.recipe
@@ -111,6 +111,7 @@ class WSJ(BasicNewsRecipe):
             date = itm['date']
             key = itm['key']
             manifest = itm['manifest']
+            self.title = itm['label']
 
         dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
         dt = dt.strftime('%b, %Y')