#!/usr/bin/env python import json import re from calibre.web.feeds.news import BasicNewsRecipe def absurl(url): if url.startswith('/'): return 'https://swarajyamag.com' + url return url html_entities = {'"': '"', ''': "'", '<': '<', '>': '>', '&': '&'} class SwarajyaMag(BasicNewsRecipe): title = 'Swarajya Magazine' __author__ = 'unkn0wn' description = 'Swarajya - a big tent for liberal right of centre discourse that reaches out, engages and caters to the new India.' language = 'en_IN' remove_javascript = True use_embedded_content = False encoding = 'utf-8' recipe_specific_options = { 'issue': { 'short': 'The edition URL ', } } def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'src': True}): img['src'] = img['src'].split('?')[0] + '?w=600' return soup def parse_index(self): d = self.recipe_specific_options.get('issue') if d and isinstance(d, str): url = d else: soup = self.index_to_soup('https://swarajyamag.com/all-issues') a = soup.find('a', href=lambda x: x and x.startswith('/issue/')) url = absurl(a['href']) self.log('Downloading issue:', url) soup = self.index_to_soup(url) ans = [] cont = soup.find(attrs={'id': 'container'}) self.cover_url = ( cont.find('a', href=lambda x: x and x.startswith('/issue/')) .img['src'] .split('?')[0] + '?w=600' ) for div in cont.findAll('div', attrs={'class': 'rounded'}): url = div.findParent('a')['href'] if url.startswith('/'): url = 'https://swarajyamag.com' + url h4 = div.find('h4') title = self.tag_to_string(h4) d = h4.next_sibling.div desc = 'By ' + self.tag_to_string(d).strip() self.log(title, ' at ', url, '\n', desc) ans.append({'title': title, 'url': url, 'description': desc}) return [('Articles', ans)] def preprocess_raw_html(self, raw, url): app = re.search( r'