mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
2bb2573ff8
@ -114,3 +114,9 @@ class Guardian(BasicNewsRecipe):
|
|||||||
feeds = list(self.parse_section(self.base_url))
|
feeds = list(self.parse_section(self.base_url))
|
||||||
feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
|
feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for table in soup.findAll('table'):
|
||||||
|
if len(table.findAll('tr')) > 20:
|
||||||
|
table.decompose()
|
||||||
|
return soup
|
||||||
|
@ -4,11 +4,12 @@ from __future__ import unicode_literals
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
|
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
|
||||||
# Written April 2015
|
# Written April 2015
|
||||||
# Last edited 08/2022
|
# Last edited 07/2024
|
||||||
'''
|
'''
|
||||||
technologyreview.com
|
technologyreview.com
|
||||||
'''
|
'''
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
|
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
|
||||||
@ -38,7 +39,6 @@ class MitTechnologyReview(BasicNewsRecipe):
|
|||||||
' This is different than the recipe named simply "Technology Review"'
|
' This is different than the recipe named simply "Technology Review"'
|
||||||
' which downloads the rss feed with daily articles from the website.'
|
' which downloads the rss feed with daily articles from the website.'
|
||||||
)
|
)
|
||||||
INDEX = 'http://www.technologyreview.com/magazine/'
|
|
||||||
language = 'en'
|
language = 'en'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
tags = 'news, technology, science'
|
tags = 'news, technology, science'
|
||||||
@ -65,22 +65,19 @@ class MitTechnologyReview(BasicNewsRecipe):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_cover_url(self):
|
|
||||||
soup = self.index_to_soup('https://www.technologyreview.com/')
|
|
||||||
if script := soup.find('script', id='preload'):
|
|
||||||
JSON = script.contents[0].split('magazineCover\":')[1].strip()
|
|
||||||
data = json.JSONDecoder().raw_decode(JSON)[0]
|
|
||||||
return data['config']['src']
|
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup(self.INDEX)
|
# for past editions, change the issue link below
|
||||||
issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
|
issue = 'http://www.technologyreview.com/magazine/'
|
||||||
time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
|
soup = self.index_to_soup(issue)
|
||||||
desc = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__description')})
|
if script := soup.find('script', id='preload'):
|
||||||
self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
|
raw = script.contents[0]
|
||||||
self.description = self.tag_to_string(desc)
|
m = re.search(r'\"children\":\[{\"name\":\"magazine-hero\"', raw)
|
||||||
self.timefmt = ' [' + self.tag_to_string(time) + ']'
|
spl = re.split(r"(?=\{)", raw[m.start():], 1)[1]
|
||||||
self.log('Downloading issue: ', self.timefmt)
|
data = json.JSONDecoder().raw_decode(spl)[0]
|
||||||
|
self.cover_url = data['children'][0]['config']['src'] + '?fit=572,786'
|
||||||
|
self.timefmt = ' [' + data['config']['issueDate'] + ']'
|
||||||
|
self.description = data['config']['description']
|
||||||
|
self.title = 'MIT TR: ' + data['config']['title']
|
||||||
|
|
||||||
# parse articles
|
# parse articles
|
||||||
feeds = OrderedDict()
|
feeds = OrderedDict()
|
||||||
|
@ -50,7 +50,7 @@ class times(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['svg']),
|
dict(name=['svg', 'times-datawrapper']),
|
||||||
dict(attrs={'id':'iframe-wrapper'}),
|
dict(attrs={'id':'iframe-wrapper'}),
|
||||||
dict(attrs={'old-position':'sticky'}),
|
dict(attrs={'old-position':'sticky'}),
|
||||||
prefixed_classes(
|
prefixed_classes(
|
||||||
|
@ -156,7 +156,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
if '-pages_' in k:
|
if '-pages_' in k:
|
||||||
section = k.split('-pages_')[0].replace('_', ' ')
|
section = k.split('-pages_')[0].replace('_', ' ')
|
||||||
if 'MAGAZINE' in section:
|
if 'MAGAZINE' in section:
|
||||||
if not dt.strftime('%d') == 1:
|
if not dt.day == 1:
|
||||||
continue
|
continue
|
||||||
self.log('Loading Magazine section')
|
self.log('Loading Magazine section')
|
||||||
self.log(section)
|
self.log(section)
|
||||||
|
@ -111,6 +111,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
date = itm['date']
|
date = itm['date']
|
||||||
key = itm['key']
|
key = itm['key']
|
||||||
manifest = itm['manifest']
|
manifest = itm['manifest']
|
||||||
|
self.title = itm['label']
|
||||||
|
|
||||||
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
|
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
|
||||||
dt = dt.strftime('%b, %Y')
|
dt = dt.strftime('%b, %Y')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user