mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fixup AINOnline for beautifulsoup 4
This commit is contained in:
parent
ea74df97ec
commit
eafb79aaca
@ -2,12 +2,10 @@
|
|||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
# License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com>
|
# License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com>
|
||||||
|
|
||||||
from __future__ import (unicode_literals, division, absolute_import,
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
print_function)
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
|
|
||||||
|
|
||||||
INDEX = 'https://www.ainonline.com/'
|
INDEX = 'https://www.ainonline.com/'
|
||||||
|
|
||||||
|
|
||||||
@ -19,24 +17,27 @@ def absurl(url):
|
|||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
q = frozenset(classes.split(' '))
|
q = frozenset(classes.split(' '))
|
||||||
return dict(attrs={
|
return dict(
|
||||||
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AINOnline(BasicNewsRecipe):
|
class AINOnline(BasicNewsRecipe):
|
||||||
title = 'Aviation International News'
|
title = 'Aviation International News'
|
||||||
__author__ = 'Jose Ortiz'
|
__author__ = 'Jose Ortiz'
|
||||||
description = ('Aviation International News covers all sectors of the aviation'
|
description = (
|
||||||
' industry, from business aviation to air transport to defense and'
|
'Aviation International News covers all sectors of the aviation'
|
||||||
' unmanned aerial vehicles.')
|
' industry, from business aviation to air transport to defense and'
|
||||||
|
' unmanned aerial vehicles.'
|
||||||
|
)
|
||||||
language = 'en'
|
language = 'en'
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg'
|
masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg'
|
||||||
keep_only_tags=[classes('main-content')]
|
keep_only_tags = [classes('main-content')]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['button','input']),
|
dict(name=['button', 'input']),
|
||||||
dict(attrs={'class': lambda x: x and 'comments' in x})
|
dict(attrs={'class': lambda x: x and 'comments' in x})
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -48,50 +49,48 @@ class AINOnline(BasicNewsRecipe):
|
|||||||
# .view-content [class *= 'featured-story']
|
# .view-content [class *= 'featured-story']
|
||||||
# .view-content .views-row
|
# .view-content .views-row
|
||||||
article_attrs = {
|
article_attrs = {
|
||||||
'class': lambda x: x and (
|
'class':
|
||||||
'featured-story' in x
|
lambda x: x and (
|
||||||
or frozenset(['views-row']).intersection(
|
'featured-story' in x or frozenset(['views-row']).
|
||||||
frozenset(x.split())))}
|
intersection(frozenset(x.split()))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
ans = []
|
ans = []
|
||||||
|
|
||||||
for section in soup.findAll(**classes('view-content')):
|
for section in soup.findAll(**classes('view-content')):
|
||||||
|
|
||||||
if section.findParent(
|
if section.findParent(attrs=dict(id='featured')) is not None:
|
||||||
attrs=dict(id='featured')) is not None:
|
|
||||||
current_section = 'Featured'
|
current_section = 'Featured'
|
||||||
elif section.findParent(
|
elif section.findParent(attrs=dict(id='home-top-stories')) is not None:
|
||||||
attrs=dict(
|
|
||||||
id='home-top-stories')) is not None:
|
|
||||||
current_section = 'Top Stories'
|
current_section = 'Top Stories'
|
||||||
elif section.findParent(
|
elif section.findParent(
|
||||||
attrs=dict(
|
attrs=dict(id='quicktabs-container-latest_trending')
|
||||||
id='quicktabs-container-latest_trending'
|
) is not None:
|
||||||
)) is not None:
|
|
||||||
current_section = 'Latest/Trending'
|
current_section = 'Latest/Trending'
|
||||||
else:
|
else:
|
||||||
current_section = 'Articles'
|
current_section = 'Articles'
|
||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
for div in section.findAll(attrs=article_attrs):
|
for div in section.findAll(attrs=article_attrs):
|
||||||
if frozenset(['views-row']).intersection(
|
if frozenset(['views-row']).intersection(frozenset(div['class'])):
|
||||||
frozenset(div['class'].split())):
|
|
||||||
a = div.find(**classes('title')).a
|
a = div.find(**classes('title')).a
|
||||||
elif 'featured-story' in div['class']:
|
elif 'featured-story' in div['class']:
|
||||||
a = div.find(
|
a = div.find(
|
||||||
lambda tag: tag.name == 'a'
|
lambda tag: tag.name == 'a' and tag.
|
||||||
and tag.find(['h1','h2','h3','h4','h5','h6'])
|
find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) is not None
|
||||||
is not None)
|
)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = absurl(a['href'])
|
url = absurl(a['href'])
|
||||||
desc = ''
|
desc = ''
|
||||||
r = div.find(**classes('teaser'))
|
r = div.find(**classes('teaser'))
|
||||||
if r is not None:
|
if r is not None:
|
||||||
desc = self.tag_to_string(r)
|
desc = self.tag_to_string(r)
|
||||||
articles.append(
|
articles.append({'title': title, 'url': url, 'description': desc})
|
||||||
{'title': title, 'url': url, 'description': desc})
|
|
||||||
if articles:
|
if articles:
|
||||||
for title, articles_ in ans:
|
for title, articles_ in ans:
|
||||||
if current_section == title:
|
if current_section == title:
|
||||||
articles_.extend(articles)
|
articles_.extend(articles)
|
||||||
break
|
break
|
||||||
|
Loading…
x
Reference in New Issue
Block a user