calibre/recipes/nature.recipe

#!/usr/bin/env python
import re
from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe, classes

# Root URL of the Nature website. Site-relative links are resolved against it
# and the current-issue index page is fetched from below it.
BASE = 'https://www.nature.com'


def absurl(url):
    """Return *url* as an absolute HTTPS URL where that can be determined.

    * Protocol-relative URLs (``//host/path``) get an ``https:`` scheme.
    * Site-relative URLs (``/path``) are resolved against ``BASE``.
    * ``http://`` URLs are upgraded to ``https://``.
    * Anything else is returned unchanged.
    """
    # Must be tested before the single-slash case: '//host/x' also starts
    # with '/', and prefixing it with BASE would produce a broken URL.
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return BASE + url
    if url.startswith('http://'):
        # Swap only the scheme; everything from '://' onwards is kept.
        return 'https' + url[4:]
    return url


def check_words(words):
    """Build a predicate that tests for *any* of the given words.

    ``words`` is a whitespace-separated string. The returned callable takes
    a whitespace-separated string ``x`` and returns the set of words common
    to both (empty, hence falsy, when there is no overlap). A falsy ``x``
    (``None`` or ``''``) is returned as-is, so a missing attribute never
    matches.
    """
    # Built once here rather than on every call of the predicate, which is
    # invoked repeatedly while searching a document.
    wanted = frozenset(words.split())

    def matches_any(x):
        return x and wanted.intersection(x.split())

    return matches_any


def has_all_of(words):
    """Build a predicate that tests for *all* of the given words.

    ``words`` is a whitespace-separated string. The returned callable takes
    a whitespace-separated string ``x`` and returns ``True`` when every word
    of ``words`` occurs in ``x``, ``False`` otherwise. A falsy ``x``
    (``None`` or ``''``) is returned as-is, so a missing attribute never
    matches.
    """
    # Built once here rather than on every call of the predicate, which is
    # invoked repeatedly while searching a document.
    required = frozenset(words.split())

    def matches_all(x):
        return x and required.issubset(x.split())

    return matches_all


class Nature(BasicNewsRecipe):
    """Recipe that collects the articles of the current issue of Nature.

    ``parse_index`` scrapes the current-issue page for the cover image and
    the per-section article lists; ``preprocess_html`` tidies each
    downloaded article page.
    """

    # Recipe metadata and options. These are consumed by the
    # BasicNewsRecipe base class, which is not visible in this file.
    title = 'Nature'
    __author__ = 'Jose Ortiz'
    description = (
        'Nature is a weekly international multidisciplinary scientific journal'
        ' publishing peer-reviewed research in all fields of science and'
        ' technology on the basis of its originality, importance,'
        ' interdisciplinary interest, timeliness, accessibility, elegance and'
        ' surprising conclusions. Nature also provides rapid, authoritative,'
        ' insightful and arresting news and interpretation of topical and coming'
        ' trends affecting science, scientists and the wider public.'
    )
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True

    # Presumably restricts each page to its <article> element -- confirm
    # against the BasicNewsRecipe documentation.
    keep_only_tags = [dict(name='article')]

    # Presumably elements stripped from each page (print-hidden blocks,
    # context bar, PDF button, hidden images) -- confirm as above.
    remove_tags = [
        classes(
            'u-hide-print hide-print c-latest-content__item c-context-bar '
            'c-pdf-button__container u-js-hide'
        ),
        dict(name='img', attrs={'class': ['visually-hidden']}),
    ]

    def parse_index(self):
        """Scrape the current-issue page into a list of sections.

        Sets ``self.cover_url`` when the issue cover image can be located.

        Returns a list of ``(section_title, articles)`` tuples in page
        order, where each article is a dict with the keys ``title``,
        ``url``, ``description``, ``date`` and ``author``.
        """
        soup = self.index_to_soup(BASE + '/nature/current-issue')

        # The cover is best-effort: a missing image or ``src`` attribute
        # must not abort the whole download.
        cover = soup.find(
            'img', attrs={'data-test': check_words('issue-cover-image')}
        )
        cover_src = cover.get('src') if cover is not None else None
        if cover_src:
            if cover_src.startswith('//'):
                # Protocol-relative URL; same handling as article images
                # in preprocess_html.
                cover_src = 'https:' + cover_src
            # Enlarge the cover by rewriting the width path component
            # (e.g. 'w200') to 'w1000'. If the URL has no such component
            # it is left as-is.
            self.cover_url = re.sub(r'\bw\d+\b', 'w1000', cover_src)
        else:
            self.log.warning('Could not find the issue cover image')

        section_tags = soup.find_all(
            'section', attrs={'data-container-type': 'issue-section-list'}
        )

        sections = defaultdict(list)
        ordered_sec_titles = []

        # NOTE(review): the lookups below rely on tag_to_string accepting
        # the None that find() returns for a missing element -- confirm.
        for sec in section_tags:
            sec_title = self.tag_to_string(sec.find('h2'))
            # Record each title once, so that sections sharing a title are
            # merged into one entry instead of being listed repeatedly.
            if sec_title not in sections:
                ordered_sec_titles.append(sec_title)
            articles = sections[sec_title]

            for article in sec.find_all('article'):
                link = article.find('a', {'itemprop': check_words('url')})
                if link is None or not link.get('href'):
                    # No usable link: nothing to download for this entry.
                    continue
                url = absurl(link['href'])

                title = self.tag_to_string(
                    article.find('h3', {'itemprop': has_all_of('name headline')})
                )
                date = ' [' + self.tag_to_string(
                    article.find('time', {'itemprop': check_words('datePublished')})
                ) + ']'
                author = self.tag_to_string(
                    article.find('li', {'itemprop': check_words('creator')})
                )
                # Description is "<article type> • <summary>".
                description = self.tag_to_string(
                    article.find(attrs={'data-test': check_words('article.type')})
                ) + u' • '
                description += self.tag_to_string(
                    article.find(
                        'div', attrs={'itemprop': check_words('description')}
                    )
                )
                articles.append({
                    'title': title,
                    'url': url,
                    'description': description,
                    'date': date,
                    'author': author
                })

        return [(k, sections[k]) for k in ordered_sec_titles]

    def preprocess_html(self, soup):
        """Fix image sources and drop surplus article containers.

        Images carrying a ``data-src`` attribute get it copied to ``src``
        (with ``https:`` prepended to protocol-relative URLs). Every
        ``article-container`` div after the first is removed. Returns the
        modified ``soup``.
        """
        for img in soup.find_all('img', {'data-src': True}):
            data_src = img['data-src']
            if data_src.startswith('//'):
                img['src'] = 'https:' + data_src
            else:
                img['src'] = data_src
        extra_containers = soup.find_all(
            'div', {'data-component': check_words('article-container')}
        )[1:]
        for div in extra_containers:
            div.extract()
        return soup