calibre/recipes/swarajya.recipe
Kovid Goyal 22b5c958aa
...
2024-10-14 09:57:19 +05:30

92 lines
3.1 KiB
Python

#!/usr/bin/env python
import json
import re
from calibre.web.feeds.news import BasicNewsRecipe
def absurl(url):
    """Return *url* as an absolute swarajyamag.com URL.

    A site-relative path (one beginning with ``/``) is prefixed with the
    site origin; any other value is handed back unchanged.
    """
    site_relative = url.startswith('/')
    return ('https://swarajyamag.com' + url) if site_relative else url
# HTML entity -> literal character table. The keys are joined into a regex
# alternation in preprocess_raw_html to unescape the article body.
html_entities = {
    '&quot;': '"',
    '&apos;': "'",
    '&lt;': '<',
    '&gt;': '>',
    '&amp;': '&',
}
class SwarajyaMag(BasicNewsRecipe):
    """Calibre recipe that downloads one issue of Swarajya Magazine.

    The issue index is scraped from swarajyamag.com; each article page is
    then rebuilt from the JSON-LD metadata embedded in its raw HTML.
    """

    title = 'Swarajya Magazine'
    __author__ = 'unkn0wn'
    description = 'Swarajya - a big tent for liberal right of centre discourse that reaches out, engages and caters to the new India.'
    language = 'en_IN'
    remove_javascript = True
    use_embedded_content = False
    encoding = 'utf-8'

    # Declares a single user option, 'issue': the URL of the edition to fetch.
    recipe_specific_options = {
        'issue': {
            'short': 'The edition URL ',
        }
    }

    def preprocess_html(self, soup):
        """Rewrite every image URL to drop its query string and use ``?w=600``.

        Returns the same soup object, modified in place.
        """
        # NOTE(review): ``w=600`` presumably asks the image server for a
        # 600px-wide rendition - confirm against the site's image CDN.
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = img['src'].split('?')[0] + '?w=600'
        return soup

    def parse_index(self):
        """Build the article list for the selected issue.

        Uses the 'issue' option as the issue URL when it holds a string;
        otherwise picks the first ``/issue/`` link on the all-issues page.
        Also sets ``self.cover_url`` from the issue page.

        Returns a one-element list ``[('Articles', articles)]`` where each
        article is a dict with 'title', 'url' and 'description' keys.
        """
        issue_opt = self.recipe_specific_options.get('issue')
        # The class-level default for 'issue' is a dict, so only a string
        # value is treated as a URL (presumably set by calibre from the
        # user's input - TODO confirm).
        if issue_opt and isinstance(issue_opt, str):
            issue_url = issue_opt
        else:
            soup = self.index_to_soup('https://swarajyamag.com/all-issues')
            a = soup.find('a', href=lambda x: x and x.startswith('/issue/'))
            issue_url = absurl(a['href'])
        self.log('Downloading issue:', issue_url)
        soup = self.index_to_soup(issue_url)
        ans = []

        cont = soup.find(attrs={'id': 'container'})
        # Cover: image inside the first /issue/ link of the container,
        # normalised to the same ?w=600 form used for article images.
        self.cover_url = (
            cont.find('a', href=lambda x: x and x.startswith('/issue/'))
            .img['src']
            .split('?')[0]
            + '?w=600'
        )
        for div in cont.findAll('div', attrs={'class': 'rounded'}):
            # Each 'rounded' div sits inside the <a> linking to its article.
            url = absurl(div.findParent('a')['href'])
            h4 = div.find('h4')
            title = self.tag_to_string(h4)
            # NOTE(review): the element after the <h4> presumably holds the
            # author name, given the 'By ' prefix - confirm against the page.
            byline = h4.next_sibling.div
            desc = 'By ' + self.tag_to_string(byline).strip()
            self.log(title, ' at ', url, '\n', desc)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]

    def preprocess_raw_html(self, raw, url):
        """Rebuild an article page from its embedded JSON-LD metadata.

        Looks for the ``application/ld+json`` script whose object starts with
        a "headline" key and assembles a minimal HTML document from its
        section, headline, description, image, author(s) and article body.

        Returns the rebuilt HTML string, or ``raw`` unchanged when the
        JSON-LD block is not found.
        """
        match = re.search(
            r'<script type=\"application/ld\+json\">({\"headline.+})', raw
        )
        if match is None:
            # No metadata to rebuild from; hand the page back untouched
            # rather than failing on ``None.group``.
            self.log('No JSON-LD article data found at', url)
            return raw
        # raw_decode stops at the end of the first JSON value, so any text
        # the greedy regex captured after it is ignored.
        data = json.JSONDecoder().raw_decode(match.group(1))[0]

        title = f'<h1>{data["headline"]}</h1>'
        # Single-pass substitution: each entity is replaced exactly once, so
        # '&amp;lt;' becomes '&lt;' and is not unescaped a second time.
        pattern = re.compile('|'.join(html_entities))
        body = pattern.sub(lambda m: html_entities[m.group(0)], data['articleBody'])

        image = desc = auth = sec = ''
        if data.get('articleSection'):
            sec = f'<div style="font-size: small;">{data["articleSection"]}</div>'
        if data.get('author'):
            authors = data['author']
            if isinstance(authors, dict):
                # Tolerate a single author object as well as a list of them.
                authors = [authors]
            names = ', '.join(x['name'] for x in authors)
            # The closing quote of the style attribute was missing, which
            # produced a malformed <p> tag around the byline.
            auth = f'<p style="font-size: small;">By {names}</p>'
        if data.get('image'):
            image_url = data['image']['url']
            image = f'<div><img src="{image_url}"></div>'
        if data.get('description'):
            desc = '<p>' + data['description'] + '</p>'
        return (
            '<html><body>' + sec + title + desc + image + auth + body
            + '</body></html>'
        )