Update The Atlantic

This commit is contained in:
Kovid Goyal 2020-12-23 10:19:28 +05:30
parent d4c2b82582
commit c9193e3e53
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 46 additions and 4 deletions

View File

@ -9,6 +9,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
web_version = False web_version = False
# Debugging switch. When set to an article URL, parse_index returns a single
# 'Articles' section containing only that URL (titled 'Test article'), and the
# class-body branch guarded by `web_version and not test_article` is skipped.
# With the value None, both of those checks are falsy and have no effect.
test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
def classes(classes): def classes(classes):
@ -18,6 +20,19 @@ def classes(classes):
) )
def prefix_classes(classes):
    """Return a tag-matching spec that selects elements by class-name prefix.

    ``classes`` is a whitespace-separated string of prefixes, e.g.
    ``'ArticleHeader_root__ ArticleLayoutSection_main__'``.

    The result is ``{'attrs': {'class': test}}``, the same shape used for the
    other entries in lists such as ``remove_tags``. ``test(x)`` returns True
    when any whitespace-separated token of ``x`` starts with one of the
    prefixes, and False otherwise (including when ``x`` is None or empty).
    """
    # str.startswith accepts a tuple of candidates, so build it once here
    # instead of looping over the prefixes for every class token.
    prefixes = tuple(classes.split())

    def test(x):
        if not x:
            return False
        return any(cls.startswith(prefixes) for cls in x.split())

    return dict(attrs={'class': test})
class TheAtlantic(BasicNewsRecipe): class TheAtlantic(BasicNewsRecipe):
if web_version: if web_version:
@ -38,6 +53,9 @@ class TheAtlantic(BasicNewsRecipe):
'c-article-header__hed c-rubric article-header c-article-meta c-lead-media' 'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
' lead-img article-cover-extra article-body article-magazine article-cover-content' ' lead-img article-cover-extra article-body article-magazine article-cover-content'
), ),
prefix_classes(
'ArticleHeader_root__ ArticleLayoutSection_main__'
),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles # these are for photos articles
dict(id='article-header'), dict(id='article-header'),
@ -45,9 +63,10 @@ class TheAtlantic(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
classes( classes(
'c-ad c-share-social social-kit-top letter-writer-info callout secondary-byline embed-wrapper' 'c-ad c-share-social c-recirculation-link social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social' ' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
), ),
prefix_classes('ArticleRecirc_inline__'),
{ {
'name': ['meta', 'link', 'noscript', 'aside', 'h3'] 'name': ['meta', 'link', 'noscript', 'aside', 'h3']
}, },
@ -103,7 +122,7 @@ class TheAtlantic(BasicNewsRecipe):
ans = None ans = None
return ans return ans
if web_version: if web_version and not test_article:
use_embedded_content = False use_embedded_content = False
@ -129,6 +148,8 @@ class TheAtlantic(BasicNewsRecipe):
] ]
else: else:
def parse_index(self): def parse_index(self):
if test_article:
return [('Articles', [{'title': 'Test article', 'url': test_article}])]
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
figure = soup.find('figure', id='cover-image') figure = soup.find('figure', id='cover-image')
if figure is not None: if figure is not None:

View File

@ -9,6 +9,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
web_version = True web_version = True
# Debugging switch. When set to an article URL, parse_index returns a single
# 'Articles' section containing only that URL (titled 'Test article'), and the
# class-body branch guarded by `web_version and not test_article` is skipped.
# With the value None, both of those checks are falsy and have no effect.
test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
def classes(classes): def classes(classes):
@ -18,6 +20,19 @@ def classes(classes):
) )
def prefix_classes(classes):
    """Return a tag-matching spec that selects elements by class-name prefix.

    ``classes`` is a whitespace-separated string of prefixes, e.g.
    ``'ArticleHeader_root__ ArticleLayoutSection_main__'``.

    The result is ``{'attrs': {'class': test}}``, the same shape used for the
    other entries in lists such as ``remove_tags``. ``test(x)`` returns True
    when any whitespace-separated token of ``x`` starts with one of the
    prefixes, and False otherwise (including when ``x`` is None or empty).
    """
    # str.startswith accepts a tuple of candidates, so build it once here
    # instead of looping over the prefixes for every class token.
    prefixes = tuple(classes.split())

    def test(x):
        if not x:
            return False
        return any(cls.startswith(prefixes) for cls in x.split())

    return dict(attrs={'class': test})
class TheAtlantic(BasicNewsRecipe): class TheAtlantic(BasicNewsRecipe):
if web_version: if web_version:
@ -38,6 +53,9 @@ class TheAtlantic(BasicNewsRecipe):
'c-article-header__hed c-rubric article-header c-article-meta c-lead-media' 'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
' lead-img article-cover-extra article-body article-magazine article-cover-content' ' lead-img article-cover-extra article-body article-magazine article-cover-content'
), ),
prefix_classes(
'ArticleHeader_root__ ArticleLayoutSection_main__'
),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles # these are for photos articles
dict(id='article-header'), dict(id='article-header'),
@ -45,9 +63,10 @@ class TheAtlantic(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
classes( classes(
'c-ad c-share-social social-kit-top letter-writer-info callout secondary-byline embed-wrapper' 'c-ad c-share-social c-recirculation-link social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social' ' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
), ),
prefix_classes('ArticleRecirc_inline__'),
{ {
'name': ['meta', 'link', 'noscript', 'aside', 'h3'] 'name': ['meta', 'link', 'noscript', 'aside', 'h3']
}, },
@ -103,7 +122,7 @@ class TheAtlantic(BasicNewsRecipe):
ans = None ans = None
return ans return ans
if web_version: if web_version and not test_article:
use_embedded_content = False use_embedded_content = False
@ -129,6 +148,8 @@ class TheAtlantic(BasicNewsRecipe):
] ]
else: else:
def parse_index(self): def parse_index(self):
if test_article:
return [('Articles', [{'title': 'Test article', 'url': test_article}])]
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
figure = soup.find('figure', id='cover-image') figure = soup.find('figure', id='cover-image')
if figure is not None: if figure is not None: