This commit is contained in:
Kovid Goyal 2024-07-06 11:31:40 +05:30
commit 2bb2573ff8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 23 additions and 19 deletions

View File

@ -114,3 +114,9 @@ class Guardian(BasicNewsRecipe):
feeds = list(self.parse_section(self.base_url)) feeds = list(self.parse_section(self.base_url))
feeds += list(self.parse_section('https://www.theguardian.com/uk/sport')) feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
return feeds return feeds
def preprocess_html(self, soup):
    """Remove every table with more than 20 rows and return the soup.

    Tables are examined and removed one at a time, in the order that
    ``soup.findAll('table')`` yields them; the row count of a table is
    taken only when that table's turn comes.
    """
    row_limit = 20
    for candidate in soup.findAll('table'):
        row_count = len(candidate.findAll('tr'))
        if row_count <= row_limit:
            # Small enough to keep as-is.
            continue
        candidate.decompose()
    return soup

View File

@ -4,11 +4,12 @@ from __future__ import unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>' __copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
# Written April 2015 # Written April 2015
# Last edited 08/2022 # Last edited 07/2024
''' '''
technologyreview.com technologyreview.com
''' '''
import json import json
import re
from collections import OrderedDict from collections import OrderedDict
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
@ -38,7 +39,6 @@ class MitTechnologyReview(BasicNewsRecipe):
' This is different than the recipe named simply "Technology Review"' ' This is different than the recipe named simply "Technology Review"'
' which downloads the rss feed with daily articles from the website.' ' which downloads the rss feed with daily articles from the website.'
) )
INDEX = 'http://www.technologyreview.com/magazine/'
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
tags = 'news, technology, science' tags = 'news, technology, science'
@ -65,22 +65,19 @@ class MitTechnologyReview(BasicNewsRecipe):
), ),
] ]
def get_cover_url(self):
    """Return the URL of the magazine cover image, or None if unavailable.

    Loads the technologyreview.com home page, looks for the
    ``<script id="preload">`` element and decodes the JSON value that
    follows the first ``magazineCover":`` marker in its text. The cover
    URL is read from ``config.src`` of that value.

    Returns None (instead of raising) when the script element, the
    marker, the JSON value or the expected keys are missing, so a page
    layout change only costs the cover rather than aborting the lookup.
    """
    soup = self.index_to_soup('https://www.technologyreview.com/')
    script = soup.find('script', id='preload')
    if not script or not script.contents:
        return None
    # partition() never raises; an empty separator slot means "marker absent".
    _, marker, tail = script.contents[0].partition('magazineCover":')
    if not marker:
        return None
    try:
        # raw_decode() parses the leading JSON value and ignores whatever
        # script text follows it.
        data = json.JSONDecoder().raw_decode(tail.strip())[0]
        return data['config']['src']
    except (ValueError, KeyError, TypeError):
        # Malformed JSON, or a value without the expected config/src keys.
        return None
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.INDEX) # for past editions, change the issue link below
issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')}) issue = 'http://www.technologyreview.com/magazine/'
time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')}) soup = self.index_to_soup(issue)
desc = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__description')}) if script := soup.find('script', id='preload'):
self.title = 'MIT Tech Review ' + self.tag_to_string(issue) raw = script.contents[0]
self.description = self.tag_to_string(desc) m = re.search(r'\"children\":\[{\"name\":\"magazine-hero\"', raw)
self.timefmt = ' [' + self.tag_to_string(time) + ']' spl = re.split(r"(?=\{)", raw[m.start():], 1)[1]
self.log('Downloading issue: ', self.timefmt) data = json.JSONDecoder().raw_decode(spl)[0]
self.cover_url = data['children'][0]['config']['src'] + '?fit=572,786'
self.timefmt = ' [' + data['config']['issueDate'] + ']'
self.description = data['config']['description']
self.title = 'MIT TR: ' + data['config']['title']
# parse articles # parse articles
feeds = OrderedDict() feeds = OrderedDict()

View File

@ -50,7 +50,7 @@ class times(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
dict(name=['svg']), dict(name=['svg', 'times-datawrapper']),
dict(attrs={'id':'iframe-wrapper'}), dict(attrs={'id':'iframe-wrapper'}),
dict(attrs={'old-position':'sticky'}), dict(attrs={'old-position':'sticky'}),
prefixed_classes( prefixed_classes(

View File

@ -156,7 +156,7 @@ class WSJ(BasicNewsRecipe):
if '-pages_' in k: if '-pages_' in k:
section = k.split('-pages_')[0].replace('_', ' ') section = k.split('-pages_')[0].replace('_', ' ')
if 'MAGAZINE' in section: if 'MAGAZINE' in section:
if not dt.strftime('%d') == 1: if not dt.day == 1:
continue continue
self.log('Loading Magazine section') self.log('Loading Magazine section')
self.log(section) self.log(section)

View File

@ -111,6 +111,7 @@ class WSJ(BasicNewsRecipe):
date = itm['date'] date = itm['date']
key = itm['key'] key = itm['key']
manifest = itm['manifest'] manifest = itm['manifest']
self.title = itm['label']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b, %Y') dt = dt.strftime('%b, %Y')