Improvements to The Atlantic recipe

Removed some elements that were making the output ugly. Added inline images in addition to the lead image. Made captions smaller to distinguish them from the body text. Right-aligned credits.
This commit is contained in:
jfhutson 2017-02-17 22:08:44 -06:00 committed by GitHub
parent 7cfda558ed
commit a896d661f1

View File

@ -26,19 +26,25 @@ class TheAtlantic(BasicNewsRecipe):
keep_only_tags = [ keep_only_tags = [
classes( classes(
'article-header article-body article-magazine article-cover-content lead-img'), 'article-header article-body article-magazine article-cover-content article-cover-extra lead-img '),
] {'name': ['img']},
]
remove_tags = [ remove_tags = [
{'name': ['meta', 'link', 'noscript']}, classes( 'social-kit-top letter-writer-info callout secondary-byline embed-wrapper offset-wrapper boxtop-most-popular'),
{'attrs': {'class': ['offset-wrapper', 'ad-boxfeatures-wrapper']}}, {'name': ['meta', 'link', 'noscript', 'aside', 'h3']},
{'attrs': {'class': ['offset-wrapper', 'boxtop-most-popular']}},
{'attrs': {'class': lambda x: x and 'article-tools' in x}}, {'attrs': {'class': lambda x: x and 'article-tools' in x}},
{'src': lambda x: x and 'spotxchange.com' in x}, {'src': lambda x: x and 'spotxchange.com' in x},
] ]
remove_tags_after = classes('article-body') remove_tags_after = classes('article-body')
no_stylesheets = True no_stylesheets = True
remove_attributes = ['style'] remove_attributes = ['style']
extra_css = '''
.credit { text-align: right; font-size: 75%; display: block }
.figcaption { font-size: 75% }
.caption { font-size: 75% }
.lead-img { display: block }'''
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com') br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
@ -51,8 +57,8 @@ class TheAtlantic(BasicNewsRecipe):
return url + '?single_page=true' return url + '?single_page=true'
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-src': True}): for img in soup.findAll('img', attrs={'data-srcset': True}):
img['src'] = img['data-src'] img['src'] = img['data-srcset']
return soup return soup
def parse_index(self): def parse_index(self):