calibre/recipes/ainonline.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

102 lines
3.4 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com>
from __future__ import (unicode_literals, division, absolute_import, print_function)
from calibre.web.feeds.recipes import BasicNewsRecipe
from pprint import pformat
# Front page of the site: parse_index() scrapes it for the article list and
# absurl() uses it as the prefix for root-relative links. The trailing slash
# matters because absurl() drops the leading '/' of the path before
# concatenating.
INDEX = 'https://www.ainonline.com/'
def absurl(url):
    """Return *url* as an absolute URL, resolving it against ``INDEX``.

    * Scheme-relative links (``//host/path``) receive the scheme of
      ``INDEX``; previously they were treated as root-relative and mangled
      into ``<INDEX>/host/path``.
    * Root-relative links (``/path``) are appended to ``INDEX``.
    * Anything else (already absolute, empty, ...) is returned unchanged.
    """
    if url.startswith('//'):
        # Must be tested before the single-slash case, which it also matches.
        # 'https://host/'.partition('//')[0] yields 'https:'.
        return INDEX.partition('//')[0] + url
    if url.startswith('/'):
        # INDEX already ends with '/', so drop the leading one from the path.
        return INDEX + url[1:]
    return url
def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching by CSS class.

    *classes* is a space-separated list of class names. The returned dict has
    a single ``attrs`` entry whose ``class`` predicate yields a truthy value
    when the inspected class string shares at least one name with *classes*.
    """
    wanted = frozenset(classes.split(' '))

    def shares_a_class(value):
        # A missing or empty class attribute is handed back untouched so the
        # predicate stays falsy without trying to split it.
        if not value:
            return value
        return frozenset(value.split()) & wanted

    return {'attrs': {'class': shares_a_class}}
class AINOnline(BasicNewsRecipe):
    """Recipe that builds its article index from the ainonline.com front page.

    ``parse_index`` scrapes ``INDEX`` and groups the teasers it finds into
    sections named after the page container that encloses them.
    """

    title = 'Aviation International News'
    __author__ = 'Jose Ortiz'
    description = (
        'Aviation International News covers all sectors of the aviation'
        ' industry, from business aviation to air transport to defense and'
        ' unmanned aerial vehicles.'
    )
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg'

    # Selector for elements whose class list contains 'main-content'.
    keep_only_tags = [classes('main-content')]
    # Selectors for form controls and for anything whose class attribute
    # mentions 'comments'.
    remove_tags = [
        dict(name=['button', 'input']),
        dict(attrs={'class': lambda x: x and 'comments' in x})
    ]

    def parse_index(self):
        """Scrape the front page and return the article index.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections are first met on the page. ``articles`` is a list of
        dicts with the keys ``title``, ``url`` and ``description``. Several
        ``view-content`` blocks that map to the same section are merged, and
        blocks that yield no article are left out.

        Teasers without a usable link are skipped (and logged) instead of
        raising, so one malformed entry cannot abort the whole index.
        """
        soup = self.index_to_soup(INDEX)

        # css selectors for articles
        #   .view-content [class *= 'featured-story']
        #   .view-content .views-row
        article_attrs = {
            'class':
            lambda x: x and (
                'featured-story' in x or
                frozenset(['views-row']).intersection(frozenset(x.split()))
            )
        }

        # (id of an enclosing element, section title), tested in this order;
        # the first enclosing element found decides the section.
        section_by_parent_id = (
            ('featured', 'Featured'),
            ('home-top-stories', 'Top Stories'),
            ('quicktabs-container-latest_trending', 'Latest/Trending'),
        )
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

        ans = []
        for section in soup.findAll(**classes('view-content')):
            current_section = 'Articles'  # fallback when no known parent
            for parent_id, section_title in section_by_parent_id:
                if section.findParent(attrs=dict(id=parent_id)) is not None:
                    current_section = section_title
                    break

            articles = []
            for div in section.findAll(attrs=article_attrs):
                if 'views-row' in div['class']:
                    # Regular rows keep their link inside a 'title' element.
                    heading = div.find(**classes('title'))
                    a = None if heading is None else heading.a
                elif 'featured-story' in div['class']:
                    # Featured stories: the link is the anchor that wraps a
                    # heading tag.
                    a = div.find(
                        lambda tag: tag.name == 'a' and
                        tag.find(heading_tags) is not None
                    )
                else:
                    continue

                if a is None or not a.get('href'):
                    # The markup did not have the expected shape; skip this
                    # teaser rather than failing on None / a missing href.
                    self.log(
                        'Skipping index entry without a usable link in section: %s'
                        % current_section
                    )
                    continue

                title = self.tag_to_string(a)
                url = absurl(a['href'])
                desc = ''
                r = div.find(**classes('teaser'))
                if r is not None:
                    desc = self.tag_to_string(r)
                articles.append({'title': title, 'url': url, 'description': desc})

            if articles:
                # Merge into an already-listed section of the same name, or
                # append a new one (for/else: no break means not found).
                for title, articles_ in ans:
                    if current_section == title:
                        articles_.extend(articles)
                        break
                else:
                    ans.append((current_section, articles))

        self.log(pformat(ans))
        return ans