Update The Independent UK

This commit is contained in:
Kovid Goyal 2018-06-02 15:42:20 +05:30
parent 924acd1d0c
commit 66be8fe65d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -5,6 +5,12 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def classes(classes):
    """Build a tag matcher that selects elements having any of the given classes.

    ``classes`` is a whitespace-separated string of CSS class names.

    Returns a dict of the form ``{'attrs': {'class': predicate}}``, the same
    shape as the other ``dict(attrs=...)`` entries used in ``keep_only_tags``
    and ``remove_tags``. ``predicate`` receives an element's raw ``class``
    attribute value and returns:

    * the value itself, unchanged, when it is falsy (``None`` or ``''``), so
      elements without a class never match;
    * otherwise the frozenset of requested class names present on the
      element, which is empty (falsy) when there is no overlap.
    """
    # Split on any run of whitespace rather than on single spaces only, so
    # class names separated by tabs or newlines are kept as separate tokens
    # and extra padding never yields empty-string tokens.
    wanted = frozenset(classes.split())

    def has_wanted_class(value):
        # Missing or empty class attribute: hand the falsy value straight back.
        if not value:
            return value
        return wanted.intersection(value.split())

    return dict(attrs={'class': has_wanted_class})
class TheIndependentNew(BasicNewsRecipe):
title = u'The Independent'
@@ -28,15 +34,11 @@ class TheIndependentNew(BasicNewsRecipe):
compress_news_images = True
keep_only_tags = [
dict(itemprop=['articleBody', 'headline', 'contentUrl']),
dict(attrs={'class': ['intro', 'author']}),
classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
]
remove_tags = [
dict(attrs={'class': lambda x: x and 'show-all' in x.split()}),
dict(attrs={'class': lambda x: x and 'context-sdl_editor_representation' in x.split()}),
dict(attrs={'data-scald-gallery': True}),
classes('inline-related inline-readmore ad-wrapper icon-gallery i-gallery')
]
remove_attributes = ['style']
def get_browser(self, *a, **kw):
@@ -51,6 +53,10 @@ class TheIndependentNew(BasicNewsRecipe):
return br
def preprocess_html(self, soup):
for img in soup.findAll('amp-img'):
img.name = 'img'
img['srcset'] = ''
for div in soup.findAll(attrs={'class': 'full-gallery'}):
imgs = {}
for li in div.findAll('li', attrs={'data-gallery-item': True, 'data-original': True}):