# calibre/recipes/heise_ix.recipe

from calibre.web.feeds.recipes import BasicNewsRecipe

__license__ = 'GPL v3'
__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'
'''
Heise Select Magazine - iX
'''


class heise_select(BasicNewsRecipe):
    """Recipe that fetches one issue of the iX magazine from Heise Select.

    The issue to download is taken from the ``issue`` attribute; when it is
    ``None`` the current issue is looked up on the Heise Select overview
    page. If a username and password are configured, ``get_browser`` logs
    in through the Heise SSO form before anything is downloaded.
    """

    issue = None
    # overwrite this for easy download of previous issues
    # issue = '/select/ix/2020/3'

    title = 'iX'
    timefmt = ''
    __author__ = 'Ralf Hein'
    needs_subscription = True
    description = 'Das iX Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
    publisher = 'Heise Verlag'
    authors = 'Heise Verlag'
    category = 'it'
    tags = 'Magazin, IT, computer, ix'
    publication_type = 'magazine'
    no_stylesheets = True
    use_embedded_content = False
    compress_news_images = True
    encoding = 'utf-8'
    language = 'de'

    conversion_options = {
        'base_font_size': 10,
        'no_inline_navbars': True,
        'language': language,
        'publisher': publisher,
        'authors': publisher
    }

    # some code cleanup
    remove_tags = [
        dict(name='meta'),
        dict(name='link', attrs={'rel': 'icon'}),
        dict(name='link', attrs={'rel': 'dns-prefetch'}),
        dict(name='link', attrs={'rel': 'preconnect'}),
        dict(name='div', attrs={'class': 'meta__group--issue'}),
        dict(name='p', attrs={'class': 'comment'}),
        dict(name='div', attrs={'class': 'pswp'}),
        dict(name='div', attrs={'class': 'bottom-links'}),
    ]

    # content is neatly within <main> element
    remove_tags_before = [dict(name='main')]
    remove_tags_after = [dict(name='main')]

    def _find_current_issue(self, baseref):
        """Return the relative URL of the current iX issue.

        Reads the ``/select`` overview page below ``baseref`` and returns the
        ``href`` of the issue link inside the iX magazine section.

        Raises:
            ValueError: if the iX section or its issue link is not present
                in the page.
        """
        overview_url = baseref + '/select'
        soup = self.index_to_soup(overview_url)
        sec = soup.find('section', attrs={'class': 'magazine--ix'})
        link = None
        if sec is not None:
            link = sec.find(
                'a', attrs={'class': 'magazine__link--issue'}, href=True
            )
        if link is None:
            # Fail with a readable message instead of an AttributeError /
            # TypeError on None when the expected markup is missing.
            raise ValueError(
                'Could not find the current iX issue on ' + overview_url
            )
        return link['href']

    def parse_index(self):
        """Build the table of contents for the selected issue.

        Side effects: sets ``self.issue`` when it was ``None``, appends the
        issue number to ``self.title`` and sets ``self.cover_url``.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``'title'`` and ``'url'``.

        Raises:
            ValueError: if no issue is configured and the current issue
                cannot be located on the overview page.
        """
        baseref = 'https://www.heise.de'
        # find current issue if not defined
        if self.issue is None:
            self.issue = self._find_current_issue(baseref)

        issue_num = self.issue.replace('/select/ix/', '')
        # fix title with issue number to keep them neatly organised
        self.title += ' ' + issue_num.replace('/', '-')
        self.cover_url = 'https://www.heise.de/select/thumbnail/ix/' + issue_num

        soup = self.index_to_soup(baseref + self.issue)
        toc = []

        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
            section_title = h3.text
            articles = []
            ul = h3.find_next('ul')
            # A heading without a following list yields an empty section
            # rather than a crash.
            items = []
            if ul is not None:
                items = ul.findAll('li', attrs={'class': 'xp__toc__item'})

            for li in items:
                link = li.find('a', attrs={'class': 'xp__link'}, href=True)
                subtitle = li.find(
                    'span', attrs={'class': 'xp__toc__item-subtitle'}
                )
                # Skip entries that lack a link or a subtitle; they cannot
                # be turned into a usable article record.
                if link is None or subtitle is None:
                    continue
                articles.append({
                    'title': subtitle.text,
                    'url': baseref + link['href'],
                })
            toc.append((section_title, articles))

        return toc

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        When both ``self.username`` and ``self.password`` are set, the Heise
        SSO login form is opened, filled in and submitted before the browser
        is returned.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
            br.open(loginURL)
            br.select_form(action='/sso/login/login')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()

        return br

    def preprocess_html(self, soup):
        """Replace the JS-driven image anchors with ordinary ``<img>`` tags.

        Every ``<a class="js-pswp-image">`` that has an ``href`` is replaced
        by an ``<img>`` whose ``src`` is that ``href`` and whose ``alt`` is
        the anchor's ``data-pswp-bu`` attribute (empty when absent). Anchors
        without an ``href`` are left untouched.

        Returns:
            The same ``soup`` object, modified in place.
        """
        # images are dynamically sized via js + a-img tag, epub can not work with this
        # construct ordinary img from it
        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
            # .get() avoids a KeyError on anchors missing the attribute.
            href = aimg.get('href')
            if not href:
                continue
            # Build a fresh tag per anchor so a tag from an earlier
            # iteration can never be reused for a different anchor.
            img = soup.new_tag(
                'img',
                src=href,
                alt=aimg.get('data-pswp-bu') or '',
                style="display: block;"
            )
            aimg.replaceWith(img)

        return soup