calibre/recipes/the_week.recipe
2022-04-04 15:14:51 +05:30

59 lines
1.9 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>
from calibre.web.feeds.news import BasicNewsRecipe
def fix_title(title):
    """Return *title* with hyphens turned into spaces and sentence-cased.

    Every ``'-'`` is replaced by a single space, then ``str.capitalize``
    upper-cases the first character and lower-cases the rest, e.g.
    ``'COVER-Story'`` becomes ``'Cover story'``.
    """
    return title.replace('-', ' ').capitalize()
class TheWeek(BasicNewsRecipe):
    """calibre news recipe for The Week, built from theweek.in RSS feeds."""

    title = u'The Week'
    language = 'en_IN'
    __author__ = 'Kovid Goyal'
    encoding = 'utf-8'
    oldest_article = 8  # days
    max_articles_per_feed = 25
    no_stylesheets = True
    use_embedded_content = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'align', 'border', 'hspace']

    # (section title, RSS feed URL) pairs, one per section of the magazine.
    feeds = [
        ('Cover Story', 'https://www.theweek.in/theweek/cover.rss'),
        ('Sports', 'https://www.theweek.in/theweek/sports.rss'),
        ('Current', 'https://www.theweek.in/theweek/current.rss'),
        ('Statescan', 'https://www.theweek.in/theweek/statescan.rss'),
        ('Leisure', 'https://www.theweek.in/theweek/leisure.rss'),
        ('Business', 'https://www.theweek.in/theweek/business.rss'),
        ('Specials', 'https://www.theweek.in/theweek/specials.rss'),
        ('More', 'https://www.theweek.in/theweek/more.rss'),
        ('Society', 'https://www.theweek.in/leisure/society.rss'),
    ]

    def get_cover_url(self):
        """Return the cover image URL scraped from the Magzter page.

        The page is searched for the first ``<meta>`` tag whose ``content``
        attribute ends with ``view/3.jpg``; that attribute value is returned.
        Returns ``None`` when no such tag is present.
        """
        soup = self.index_to_soup(
            'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
        )
        citem = soup.find(
            'meta', content=lambda s: s and s.endswith('view/3.jpg')
        )
        if citem is None:
            return None
        return citem['content']

    def preprocess_html(self, soup):
        """Clean up one article's markup and return the modified *soup*.

        * The first ``<a>`` element is renamed to ``<div>``.
        * The text of the first ``<h2>`` is normalised with ``fix_title``.
        * Paragraphs whose only content is a non-breaking space are removed.
        """
        a = soup.find('a')
        if a:
            a.name = 'div'
        h2 = soup.find('h2')
        # ``.string`` is None when the heading has several children or
        # nested markup; leave such headings untouched instead of crashing
        # inside fix_title().
        heading_text = h2.string if h2 else None
        if heading_text:
            h2.string = fix_title(heading_text)
        for p in soup.findAll('p'):
            if p.string == '\xa0':
                p.decompose()
        return soup

    def populate_article_metadata(self, article, soup, first):
        """Normalise the article title with ``fix_title``.

        *soup* and *first* are accepted but not used here.
        """
        article.title = fix_title(article.title)