Update The Atlantic

This commit is contained in:
Kovid Goyal 2020-10-16 22:25:21 +05:30
parent 64e2b05b5c
commit 0e443392d7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 8 deletions

View File

@@ -33,16 +33,20 @@ class TheAtlantic(BasicNewsRecipe):
encoding = 'utf-8' encoding = 'utf-8'
keep_only_tags = [ keep_only_tags = [
dict(id='rubric'), dict(itemprop=['headline']),
dict(itemprop=['headline', 'image']),
classes( classes(
'article-header c-article-meta lead-img article-cover-extra article-body article-magazine article-cover-content' 'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
' lead-img article-cover-extra article-body article-magazine article-cover-content'
), ),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles
dict(id='article-header'),
classes('photos'),
] ]
remove_tags = [ remove_tags = [
classes( classes(
'c-ad social-kit-top letter-writer-info callout secondary-byline embed-wrapper offset-wrapper boxtop-most-popular' 'c-ad c-share-social social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
), ),
{ {
'name': ['meta', 'link', 'noscript', 'aside', 'h3'] 'name': ['meta', 'link', 'noscript', 'aside', 'h3']
@@ -84,6 +88,8 @@ class TheAtlantic(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-srcset': True}): for img in soup.findAll('img', attrs={'data-srcset': True}):
img['src'] = img['data-srcset'].split()[0] img['src'] = img['data-srcset'].split()[0]
for img in soup.findAll('img', attrs={'data-src': True}):
img['src'] = img['data-src']
return soup return soup
def print_version(self, url): def print_version(self, url):

View File

@@ -33,16 +33,20 @@ class TheAtlantic(BasicNewsRecipe):
encoding = 'utf-8' encoding = 'utf-8'
keep_only_tags = [ keep_only_tags = [
dict(id='rubric'), dict(itemprop=['headline']),
dict(itemprop=['headline', 'image']),
classes( classes(
'c-article-header__hed article-header c-article-meta lead-img article-cover-extra article-body article-magazine article-cover-content' 'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
' lead-img article-cover-extra article-body article-magazine article-cover-content'
), ),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles
dict(id='article-header'),
classes('photos'),
] ]
remove_tags = [ remove_tags = [
classes( classes(
'c-ad social-kit-top letter-writer-info callout secondary-byline embed-wrapper offset-wrapper boxtop-most-popular' 'c-ad c-share-social social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
), ),
{ {
'name': ['meta', 'link', 'noscript', 'aside', 'h3'] 'name': ['meta', 'link', 'noscript', 'aside', 'h3']
@@ -84,6 +88,8 @@ class TheAtlantic(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-srcset': True}): for img in soup.findAll('img', attrs={'data-srcset': True}):
img['src'] = img['data-srcset'].split()[0] img['src'] = img['data-srcset'].split()[0]
for img in soup.findAll('img', attrs={'data-src': True}):
img['src'] = img['data-src']
return soup return soup
def print_version(self, url): def print_version(self, url):