Fixup AINOnline for beautifulsoup 4

This commit is contained in:
Kovid Goyal 2019-06-12 13:25:04 +05:30
parent ea74df97ec
commit eafb79aaca
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,12 +2,10 @@
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com> # License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com>
from __future__ import (unicode_literals, division, absolute_import, from __future__ import (unicode_literals, division, absolute_import, print_function)
print_function)
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from pprint import pformat from pprint import pformat
INDEX = 'https://www.ainonline.com/' INDEX = 'https://www.ainonline.com/'
@ -19,24 +17,27 @@ def absurl(url):
def classes(classes): def classes(classes):
q = frozenset(classes.split(' ')) q = frozenset(classes.split(' '))
return dict(attrs={ return dict(
'class': lambda x: x and frozenset(x.split()).intersection(q)}) attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
)
class AINOnline(BasicNewsRecipe): class AINOnline(BasicNewsRecipe):
title = 'Aviation International News' title = 'Aviation International News'
__author__ = 'Jose Ortiz' __author__ = 'Jose Ortiz'
description = ('Aviation International News covers all sectors of the aviation' description = (
' industry, from business aviation to air transport to defense and' 'Aviation International News covers all sectors of the aviation'
' unmanned aerial vehicles.') ' industry, from business aviation to air transport to defense and'
' unmanned aerial vehicles.'
)
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg' masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg'
keep_only_tags=[classes('main-content')] keep_only_tags = [classes('main-content')]
remove_tags = [ remove_tags = [
dict(name=['button','input']), dict(name=['button', 'input']),
dict(attrs={'class': lambda x: x and 'comments' in x}) dict(attrs={'class': lambda x: x and 'comments' in x})
] ]
@ -48,50 +49,48 @@ class AINOnline(BasicNewsRecipe):
# .view-content [class *= 'featured-story'] # .view-content [class *= 'featured-story']
# .view-content .views-row # .view-content .views-row
article_attrs = { article_attrs = {
'class': lambda x: x and ( 'class':
'featured-story' in x lambda x: x and (
or frozenset(['views-row']).intersection( 'featured-story' in x or frozenset(['views-row']).
frozenset(x.split())))} intersection(frozenset(x.split()))
)
}
ans = [] ans = []
for section in soup.findAll(**classes('view-content')): for section in soup.findAll(**classes('view-content')):
if section.findParent( if section.findParent(attrs=dict(id='featured')) is not None:
attrs=dict(id='featured')) is not None:
current_section = 'Featured' current_section = 'Featured'
elif section.findParent( elif section.findParent(attrs=dict(id='home-top-stories')) is not None:
attrs=dict(
id='home-top-stories')) is not None:
current_section = 'Top Stories' current_section = 'Top Stories'
elif section.findParent( elif section.findParent(
attrs=dict( attrs=dict(id='quicktabs-container-latest_trending')
id='quicktabs-container-latest_trending' ) is not None:
)) is not None:
current_section = 'Latest/Trending' current_section = 'Latest/Trending'
else: else:
current_section = 'Articles' current_section = 'Articles'
articles = [] articles = []
for div in section.findAll(attrs=article_attrs): for div in section.findAll(attrs=article_attrs):
if frozenset(['views-row']).intersection( if frozenset(['views-row']).intersection(frozenset(div['class'])):
frozenset(div['class'].split())):
a = div.find(**classes('title')).a a = div.find(**classes('title')).a
elif 'featured-story' in div['class']: elif 'featured-story' in div['class']:
a = div.find( a = div.find(
lambda tag: tag.name == 'a' lambda tag: tag.name == 'a' and tag.
and tag.find(['h1','h2','h3','h4','h5','h6']) find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) is not None
is not None) )
else:
continue
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = absurl(a['href']) url = absurl(a['href'])
desc = '' desc = ''
r = div.find(**classes('teaser')) r = div.find(**classes('teaser'))
if r is not None: if r is not None:
desc = self.tag_to_string(r) desc = self.tag_to_string(r)
articles.append( articles.append({'title': title, 'url': url, 'description': desc})
{'title': title, 'url': url, 'description': desc})
if articles: if articles:
for title, articles_ in ans: for title, articles_ in ans:
if current_section == title: if current_section == title:
articles_.extend(articles) articles_.extend(articles)
break break