Fix recipe for nature.com

This commit is contained in:
ping 2022-06-10 12:21:48 +08:00
parent 609b431c91
commit ef487db79c
No known key found for this signature in database
GPG Key ID: 6CCF56BCEDD24084

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from collections import defaultdict from collections import defaultdict
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes
BASE = 'https://www.nature.com' BASE = 'https://www.nature.com'
@@ -39,11 +39,15 @@ class Nature(BasicNewsRecipe):
no_javascript = True no_javascript = True
no_stylesheets = True no_stylesheets = True
keep_only_tags = [ keep_only_tags = [dict(name="article")]
dict(name='div', attrs={'data-component': check_words('article-container')})
]
remove_tags = [dict(attrs={'class': check_words('hide-print')})] remove_tags = [
classes(
"u-hide-print hide-print c-latest-content__item c-context-bar "
"c-pdf-button__container u-js-hide"
),
dict(name="img", attrs={"class": ["visually-hidden"]}),
]
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(BASE + '/nature/current-issue') soup = self.index_to_soup(BASE + '/nature/current-issue')
@@ -57,11 +61,9 @@ class Nature(BasicNewsRecipe):
failed, img src might have changed, use default width 200 failed, img src might have changed, use default width 200
""" """
pass pass
section_tags = soup.find(
'div', {'data-container-type': check_words('issue-section-list')} section_tags = soup.find_all(
) "section", attrs={"data-container-type": "issue-section-list"}
section_tags = section_tags.findAll(
'div', {'class': check_words('article-section')}
) )
sections = defaultdict(list) sections = defaultdict(list)