Improvements to The Atlantic recipe

Removed some elements that were cluttering the output. Kept inline images in addition to the lead image. Made captions smaller to distinguish them from body text. Right-aligned credits.
This commit is contained in:
jfhutson 2017-02-17 22:08:44 -06:00 committed by GitHub
parent 7cfda558ed
commit a896d661f1

View File

@ -26,11 +26,13 @@ class TheAtlantic(BasicNewsRecipe):
keep_only_tags = [
classes(
'article-header article-body article-magazine article-cover-content lead-img'),
]
'article-header article-body article-magazine article-cover-content article-cover-extra lead-img '),
{'name': ['img']},
]
remove_tags = [
{'name': ['meta', 'link', 'noscript']},
{'attrs': {'class': ['offset-wrapper', 'ad-boxfeatures-wrapper']}},
classes( 'social-kit-top letter-writer-info callout secondary-byline embed-wrapper offset-wrapper boxtop-most-popular'),
{'name': ['meta', 'link', 'noscript', 'aside', 'h3']},
{'attrs': {'class': ['offset-wrapper', 'boxtop-most-popular']}},
{'attrs': {'class': lambda x: x and 'article-tools' in x}},
{'src': lambda x: x and 'spotxchange.com' in x},
]
@ -38,7 +40,11 @@ class TheAtlantic(BasicNewsRecipe):
no_stylesheets = True
remove_attributes = ['style']
extra_css = '''
.credit { text-align: right; font-size: 75%; display: block }
.figcaption { font-size: 75% }
.caption { font-size: 75% }
.lead-img { display: block }'''
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
@ -51,8 +57,8 @@ class TheAtlantic(BasicNewsRecipe):
return url + '?single_page=true'
def preprocess_html(self, soup):
    """Give lazily-loaded images a usable ``src`` attribute.

    Images that carry ``data-src`` get that value copied to ``src``.
    Images that carry ``data-srcset`` then get ``src`` set to the URL of
    the first candidate in that list, overriding any ``data-src`` value.

    :param soup: parsed document exposing ``findAll(name, attrs=...)`` and
        yielding tags that support item access for attributes.
    :return: the same ``soup`` object, modified in place.
    """
    for img in soup.findAll('img', attrs={'data-src': True}):
        img['src'] = img['data-src']
    for img in soup.findAll('img', attrs={'data-srcset': True}):
        # Assumes data-srcset follows the HTML srcset syntax
        # ("url descriptor, url descriptor, ..."). Copying it verbatim into
        # src would produce an invalid URL, so keep only the first URL.
        # URLs cannot contain whitespace, so the first token is that URL,
        # possibly with the candidate-separating comma still attached.
        candidates = img['data-srcset'].split()
        if candidates:
            img['src'] = candidates[0].rstrip(',')
    return soup
def parse_index(self):