#!/usr/bin/env python
# vim:fileencoding=utf-8
from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe, classes


class TheWeek(BasicNewsRecipe):
    """Calibre recipe that downloads an edition of The Week (theweek.in).

    By default the current edition index page is fetched. A specific edition
    can be requested through the ``date`` recipe-specific option, given in
    ``YYYY.MM.DD`` form.
    """

    title = 'The Week'
    description = (
        'The Week is the best selling general interest English news magazine. The magazine covers politics, entertainment,'
        ' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.')
    language = 'en_IN'
    __author__ = 'unkn0wn'
    encoding = 'utf-8'
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'url', 'title'}
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.theweek.in/content/dam/week/logo/The-Week-Logo-Big.png'

    # Article-page cleanup rules, expressed as CSS class matchers.
    keep_only_tags = [classes('article-post section-heading element11-page-content')]
    remove_tags = [classes('article-highlights sharebar')]
    remove_tags_after = [classes('articlecontentbody')]

    # Adjacent literals concatenate to the exact stylesheet string.
    extra_css = (
        ' em, blockquote { color: #202020; }'
        ' .article-image, .article-imgbox { text-align:center; font-size:small; }'
        ' .article-info { font-size:small; } '
    )

    # Declares the user-settable options. At class level the 'date' entry is
    # this help-text dict; the methods below only treat it as a request when
    # it is a string (presumably calibre substitutes the user's value --
    # confirm against the BasicNewsRecipe documentation).
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY.MM.DD format)',
            'long': 'For example, 2024.06.30'
        }
    }

    def get_cover_url(self):
        """Return the cover image URL for the current edition, or None.

        The cover is looked up on the magazine's Magzter page only when no
        specific edition date was requested; with a date set, no cover URL is
        returned. None is also returned when no matching ``meta`` tag exists.
        """
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            # A specific edition was requested: skip the cover lookup.
            return None

        soup = self.index_to_soup(
            'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
        )
        # The first <meta> whose content ends with 'view/3.jpg' is the cover.
        for citem in soup.findAll(
            'meta', content=lambda s: s and s.endswith('view/3.jpg')
        ):
            return citem['content']
        return None

    def parse_index(self):
        """Build the article list for the selected edition.

        Returns:
            A one-element list ``[('Articles', articles)]`` where each article
            is a dict with ``'title'`` and ``'url'`` keys.
        """
        issue = 'https://www.theweek.in/theweek.html'
        # Article links are recognised by a '/<year>/' path segment.
        year = datetime.today().strftime('%Y')

        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            issue = 'https://www.theweek.in/theweek.' + d + '.html'
            # For a back issue, match links against the edition's own year;
            # filtering on today's year would drop every article of an edition
            # from a previous year. Fall back to today's year if the value
            # does not start with a four-digit year.
            # NOTE(review): assumes article URLs carry the edition's year --
            # editions dated in early January may link to late-December
            # articles of the previous year; confirm against the site.
            edition_year = d.split('.', 1)[0]
            if len(edition_year) == 4 and edition_year.isdigit():
                year = edition_year

        soup = self.index_to_soup(issue)
        year_marker = '/' + year + '/'
        ans = []

        for a in soup.findAll('a', href=lambda x: x and year_marker in x):
            url = a['href']
            title = self.tag_to_string(a).strip()
            if not url or not title:
                # Skip anchors that have no usable text or target.
                continue
            self.log('\t', title)
            self.log('\t\t', url)
            ans.append({'title': title, 'url': url})
        return [('Articles', ans)]

    def preprocess_html(self, soup):
        """Point each image at its ``data-src-web`` URL and return the soup."""
        for img in soup.findAll('img', attrs={'data-src-web': True}):
            img['src'] = img['data-src-web']
        return soup