calibre/recipes/himal_southasian.recipe
2022-11-27 10:26:04 +05:30

79 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from calibre.web.feeds.news import BasicNewsRecipe, classes
from datetime import datetime, timezone, timedelta
from calibre.utils.date import parse_date
class himal(BasicNewsRecipe):
    # Recipe for Himal Southasian (himalmag.com): builds one feed per region
    # section listed on the /category/regions/ index page.
    title = 'Himal Southasian'
    __author__ = 'unkn0wn'
    description = (
        "Himal Southasian is Southasia's first and only regional magazine of politics and culture."
        ' For over 30 years, Himal Southasian has challenged nationalist orthodoxies, and covered the region with '
        'imagination, rigour and irreverence, with contributions from some of the most interesting writers in the region.'
    )
    language = 'en_IN'
    no_stylesheets = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://www.himalmag.com/wp-content/themes/himaltheme-child/images/logo.svg'
    encoding = 'utf-8'
    remove_empty_feeds = True
    resolve_internal_links = True
    oldest_article = 30  # days; anything older is dropped in articles_from_soup

    extra_css = '''
    .sub-row, .img-caption, .wp-caption-text, .comments-info-box {font-size:small;}
    em, blockquote {color:#404040;}
    '''

    remove_tags = [
        dict(name='header'),
        dict(name='footer'),
        classes('skip-link single-btm share-info title-info post-categories comment-btn comment-line'),
    ]

    def parse_index(self):
        """Return [(section_title, articles), ...] — one feed per region
        section linked from the site's category-sublist navigation."""
        index = self.index_to_soup('https://www.himalmag.com/category/regions/')
        nav_div = index.find('div', attrs={'class': 'category-sublist'})
        section_list = [
            (self.tag_to_string(a).strip(), a['href'])
            for a in nav_div.findAll('a', href=True)
        ]
        feeds = []
        # For each section title, fetch the article urls
        for section_title, section_url in section_list:
            self.log(section_title, section_url)
            soup = self.index_to_soup(section_url)
            articles = self.articles_from_soup(soup)
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def articles_from_soup(self, soup):
        """Extract {'title', 'description', 'url'} dicts from one section
        page, skipping articles older than ``self.oldest_article`` days."""
        ans = []
        div = soup.find('div', attrs={'id': 'loadmore-wrap'})
        # Loop-invariant: compute the age cut-off once, not per article.
        today = datetime.now(timezone.utc).replace(microsecond=0)
        cutoff = timedelta(days=self.oldest_article)
        for h3 in div.findAll('h3'):
            a = h3.find('a', href=True)
            url = a['href']
            title = self.tag_to_string(a)
            desc = ''
            exp = h3.findNext('div', attrs={'class': 'content-except'})
            if exp:
                desc = self.tag_to_string(exp)
            h4 = h3.findNext('h4')
            if h4:
                # The h4 text looks like 'Author | 27 November 2022'; guard
                # against entries with no '|'-separated date (the original
                # indexed [1] unconditionally and could raise IndexError).
                parts = self.tag_to_string(h4).split('|')
                if len(parts) > 1:
                    date = parse_date(parts[1].strip())
                    if (today - date) > cutoff:
                        url = ''  # blank url marks the article as too old
            if not url or not title:
                continue
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            ans.append({'title': title, 'description': desc, 'url': url})
        return ans