From eafb79aaca7143a2ec22e6d8c619045f3a6e2a8a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 12 Jun 2019 13:25:04 +0530 Subject: [PATCH] Fixup AINOnline for beautifulsoup 4 --- recipes/ainonline.recipe | 59 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/recipes/ainonline.recipe b/recipes/ainonline.recipe index ad930d6906..18b24b499c 100644 --- a/recipes/ainonline.recipe +++ b/recipes/ainonline.recipe @@ -2,12 +2,10 @@ # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2019, Jose Ortiz -from __future__ import (unicode_literals, division, absolute_import, - print_function) +from __future__ import (unicode_literals, division, absolute_import, print_function) from calibre.web.feeds.recipes import BasicNewsRecipe from pprint import pformat - INDEX = 'https://www.ainonline.com/' @@ -19,24 +17,27 @@ def absurl(url): def classes(classes): q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + return dict( + attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} + ) class AINOnline(BasicNewsRecipe): title = 'Aviation International News' __author__ = 'Jose Ortiz' - description = ('Aviation International News covers all sectors of the aviation' - ' industry, from business aviation to air transport to defense and' - ' unmanned aerial vehicles.') + description = ( + 'Aviation International News covers all sectors of the aviation' + ' industry, from business aviation to air transport to defense and' + ' unmanned aerial vehicles.' + ) language = 'en' encoding = 'utf-8' no_stylesheets = True remove_javascript = True masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg' - keep_only_tags=[classes('main-content')] + keep_only_tags = [classes('main-content')] remove_tags = [ - dict(name=['button','input']), + dict(name=['button', 'input']), dict(attrs={'class': lambda x: x and 'comments' in x}) ] @@ -48,50 +49,48 @@ class AINOnline(BasicNewsRecipe): # .view-content [class *= 'featured-story'] # .view-content .views-row article_attrs = { - 'class': lambda x: x and ( - 'featured-story' in x - or frozenset(['views-row']).intersection( - frozenset(x.split())))} + 'class': + lambda x: x and ( + 'featured-story' in x or frozenset(['views-row']). + intersection(frozenset(x.split())) + ) + } ans = [] for section in soup.findAll(**classes('view-content')): - if section.findParent( - attrs=dict(id='featured')) is not None: + if section.findParent(attrs=dict(id='featured')) is not None: current_section = 'Featured' - elif section.findParent( - attrs=dict( - id='home-top-stories')) is not None: + elif section.findParent(attrs=dict(id='home-top-stories')) is not None: current_section = 'Top Stories' elif section.findParent( - attrs=dict( - id='quicktabs-container-latest_trending' - )) is not None: + attrs=dict(id='quicktabs-container-latest_trending') + ) is not None: current_section = 'Latest/Trending' else: current_section = 'Articles' articles = [] for div in section.findAll(attrs=article_attrs): - if frozenset(['views-row']).intersection( - frozenset(div['class'].split())): + if frozenset(['views-row']).intersection(frozenset(div['class'])): a = div.find(**classes('title')).a elif 'featured-story' in div['class']: a = div.find( - lambda tag: tag.name == 'a' - and tag.find(['h1','h2','h3','h4','h5','h6']) - is not None) + lambda tag: tag.name == 'a' and tag. + find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) is not None + ) + else: + continue title = self.tag_to_string(a) url = absurl(a['href']) desc = '' r = div.find(**classes('teaser')) if r is not None: desc = self.tag_to_string(r) - articles.append( - {'title': title, 'url': url, 'description': desc}) + articles.append({'title': title, 'url': url, 'description': desc}) if articles: - for title, articles_ in ans: + for title, articles_ in ans: if current_section == title: articles_.extend(articles) break