Update The Week

This commit is contained in:
Kovid Goyal 2022-06-29 19:48:01 +05:30
parent 3a9d2c0270
commit 4ee2709924
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,36 +1,31 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime
def fix_title(title):
    """Turn a hyphen-separated string into a sentence-cased title.

    Every ``-`` in *title* is replaced by a space, after which
    ``str.capitalize`` upper-cases the first character and lower-cases
    all remaining characters.

    :param title: the text to clean up (a ``str``).
    :return: the cleaned-up string.
    """
    return title.replace('-', ' ').capitalize()
class TheWeek(BasicNewsRecipe): class TheWeek(BasicNewsRecipe):
title = u'The Week' title = u'The Week'
description = (
'The Week is the best selling general interest English news magazine. The magazine covers politics, entertainment,'
' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.')
language = 'en_IN' language = 'en_IN'
__author__ = 'Kovid Goyal' __author__ = 'unkn0wn'
encoding = 'utf-8' encoding = 'utf-8'
oldest_article = 8 # days
max_articles_per_feed = 25
no_stylesheets = True no_stylesheets = True
use_embedded_content = True use_embedded_content = False
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url', 'title'}
remove_attributes = ['style', 'align', 'border', 'hspace'] remove_attributes = ['style', 'height', 'width']
masthead_url = 'https://www.theweek.in/content/dam/week/logo/The-Week-Logo-Big.png'
feeds = [ keep_only_tags = [
('Cover Story', 'https://www.theweek.in/theweek/cover.rss'), dict(
('Sports', 'https://www.theweek.in/theweek/sports.rss'), name='div',
('Current', 'https://www.theweek.in/theweek/current.rss'), attrs={
('Statescan', 'https://www.theweek.in/theweek/statescan.rss'), 'class': [
('Leisure', 'https://www.theweek.in/theweek/leisure.rss'), 'article-title', 'article-image', 'articlecontentbody section',
('Business', 'https://www.theweek.in/theweek/business.rss'), 'element11-page-content'
('Specials', 'https://www.theweek.in/theweek/specials.rss'), ]
('More', 'https://www.theweek.in/theweek/more.rss'), }
('Society', 'https://www.theweek.in/leisure/society.rss'), ),
] ]
def get_cover_url(self): def get_cover_url(self):
@ -42,17 +37,24 @@ class TheWeek(BasicNewsRecipe):
): ):
return citem['content'] return citem['content']
def preprocess_html(self, soup): def parse_index(self):
a = soup.find('a') soup = self.index_to_soup('https://www.theweek.in/theweek.html')
if a: ans = []
a.name = 'div' d = datetime.today()
h2 = soup.find('h2')
if h2:
h2.string = fix_title(h2.string)
for p in soup.findAll('p'):
if p.string == '\xa0':
p.decompose()
return soup
def populate_article_metadata(self, article, soup, first): for a in soup.findAll(
article.title = fix_title(article.title) 'a', href=lambda x: x and '/' + d.strftime('%Y') + '/' in x
):
url = a['href']
title = self.tag_to_string(a).strip()
if not url or not title:
continue
self.log('\t', title)
self.log('\t\t', url)
ans.append({'title': title, 'url': url})
return [('Articles', ans)]
def preprocess_html(self, soup):
    """Copy each image's ``data-src-web`` value into its ``src`` attribute.

    Only ``<img>`` tags that carry a ``data-src-web`` attribute are
    touched; every other tag is left as it is.

    :param soup: parsed article document exposing ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    for img in soup.findAll('img', attrs={'data-src-web': True}):
        # Overwrite whatever ``src`` held with the ``data-src-web`` URL.
        img['src'] = img['data-src-web']
    return soup