diff --git a/recipes/ainonline.recipe b/recipes/ainonline.recipe new file mode 100644 index 0000000000..ad930d6906 --- /dev/null +++ b/recipes/ainonline.recipe @@ -0,0 +1,102 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2019, Jose Ortiz + +from __future__ import (unicode_literals, division, absolute_import, + print_function) +from calibre.web.feeds.recipes import BasicNewsRecipe +from pprint import pformat + + +INDEX = 'https://www.ainonline.com/' + + +def absurl(url): + if url.startswith('/'): + url = INDEX + url[1:] + return url + + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + +class AINOnline(BasicNewsRecipe): + title = 'Aviation International News' + __author__ = 'Jose Ortiz' + description = ('Aviation International News covers all sectors of the aviation' + ' industry, from business aviation to air transport to defense and' + ' unmanned aerial vehicles.') + language = 'en' + encoding = 'utf-8' + no_stylesheets = True + remove_javascript = True + masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg' + keep_only_tags=[classes('main-content')] + remove_tags = [ + dict(name=['button','input']), + dict(attrs={'class': lambda x: x and 'comments' in x}) + ] + + def parse_index(self): + + soup = self.index_to_soup(INDEX) + + # css selectors for articles + # .view-content [class *= 'featured-story'] + # .view-content .views-row + article_attrs = { + 'class': lambda x: x and ( + 'featured-story' in x + or frozenset(['views-row']).intersection( + frozenset(x.split())))} + + ans = [] + + for section in soup.findAll(**classes('view-content')): + + if section.findParent( + attrs=dict(id='featured')) is not None: + current_section = 'Featured' + elif section.findParent( + attrs=dict( + id='home-top-stories')) is not None: + current_section = 'Top Stories' + elif section.findParent( + attrs=dict( + id='quicktabs-container-latest_trending' + )) is not None: + current_section = 'Latest/Trending' + else: + current_section = 'Articles' + + articles = [] + for div in section.findAll(attrs=article_attrs): + if frozenset(['views-row']).intersection( + frozenset(div['class'].split())): + a = div.find(**classes('title')).a + elif 'featured-story' in div['class']: + a = div.find( + lambda tag: tag.name == 'a' + and tag.find(['h1','h2','h3','h4','h5','h6']) + is not None) + title = self.tag_to_string(a) + url = absurl(a['href']) + desc = '' + r = div.find(**classes('teaser')) + if r is not None: + desc = self.tag_to_string(r) + articles.append( + {'title': title, 'url': url, 'description': desc}) + if articles: + for title, articles_ in ans: + if current_section == title: + articles_.extend(articles) + break + else: + ans.append((current_section, articles)) + + self.log(pformat(ans)) + return ans