Heise ct and iX by Ralf Hein

2025-07-31 14:33:54 -04:00 · 2020-04-15 12:34:09 +05:30 · 2020-04-15 12:34:09 +05:30 · be261dcd71
commit be261dcd71
parent a00cde1120
2 changed files with 232 additions and 0 deletions
--- a/recipes/heise_ct.recipe
+++ b/recipes/heise_ct.recipe
@@ -0,0 +1,115 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+__license__ = 'GPL v3'
+__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'
+'''
+Heise Select Magazine - ct
+'''
+
+
+class heise_select(BasicNewsRecipe):
+    '''
+    Recipe that downloads one issue of the Heise ct magazine from
+    https://www.heise.de/select.
+
+    parse_index() uses the current issue unless ``issue`` is overridden,
+    get_browser() logs in through the Heise SSO form when credentials are
+    set, and preprocess_html() turns the JavaScript image links into plain
+    <img> tags.
+    '''
+
+    issue = None
+    # overwrite this for easy download of previous issues
+    # issue = '/select/ct/2020/8'
+
+    title = 'Heise ct'
+    timefmt = ''
+    __author__ = 'Ralf Hein'
+    needs_subscription = True
+    description = 'Das ct Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
+    publisher = 'Heise Verlag'
+    authors = 'Heise Verlag'
+    category = 'it'
+    tags = 'Magazin, IT, computer, ct'
+    publication_type = 'magazine'
+    no_stylesheets = True
+    use_embedded_content = False
+    compress_news_images = True
+    encoding = 'utf-8'
+    language = 'de'
+
+    conversion_options = {
+        'base_font_size': 10,
+        'no_inline_navbars': True,
+        'language': language,
+        'publisher': publisher,
+        'authors': publisher
+    }
+
+    # strip page furniture that is of no use in the e-book
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='link', attrs={'rel': 'icon'}),
+        dict(name='link', attrs={'rel': 'dns-prefetch'}),
+        dict(name='link', attrs={'rel': 'preconnect'}),
+        dict(name='div', attrs={'class': 'meta__group--issue'}),
+        dict(name='p', attrs={'class': 'comment'}),
+        dict(name='div', attrs={'class': 'pswp'}),
+        dict(name='div', attrs={'class': 'bottom-links'}),
+    ]
+
+    # only the <main> element is kept
+    remove_tags_before = [dict(name='main')]
+    remove_tags_after = [dict(name='main')]
+
+    def parse_index(self):
+        '''
+        Build the table of contents of the selected issue.
+
+        When ``self.issue`` is None, the link to the current issue is looked
+        up on the /select overview page first. As side effects the issue
+        number is appended to ``self.title`` and ``self.cover_url`` is set.
+
+        Returns a list of ``(section_title, articles)`` tuples, where every
+        article is a dict with the keys 'title' and 'url'.
+
+        Raises ValueError when the current issue link cannot be found on the
+        overview page.
+        '''
+        baseref = 'https://www.heise.de'
+        # find current issue if not defined
+        if self.issue is None:
+            soup = self.index_to_soup(baseref + '/select')
+            sec = soup.find('section', attrs={'class': 'magazine--ct'})
+            issue_link = None
+            if sec is not None:
+                issue_link = sec.find('a',
+                                      attrs={'class': 'magazine__link--issue'},
+                                      href=True)
+            if issue_link is None:
+                # fail with a readable message instead of an AttributeError
+                raise ValueError('Could not find the current ct issue on ' +
+                                 baseref + '/select')
+            self.issue = issue_link['href']
+
+        issue_num = self.issue.replace('/select/ct/', '')
+        # fix title with issue number to keep them neatly organised
+        self.title += ' ' + issue_num.replace('/', '-')
+        self.cover_url = 'https://www.heise.de/select/thumbnail/ct/' + issue_num
+
+        soup = self.index_to_soup(baseref + self.issue)
+        toc = []
+
+        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
+            ul = h3.find_next('ul')
+            if ul is None:
+                # section heading without an article list: nothing to collect
+                continue
+
+            articles = []
+            for li in ul.findAll('li', attrs={'class': 'xp__toc__item'}):
+                anchor = li.find('a', attrs={'class': 'xp__link'}, href=True)
+                subtitle = li.find('span',
+                                   attrs={'class': 'xp__toc__item-subtitle'})
+                if anchor is None or subtitle is None:
+                    # skip malformed entries rather than abort the download
+                    continue
+                articles.append({
+                    'title': subtitle.text,
+                    'url': baseref + anchor['href']
+                })
+            toc.append((h3.text, articles))
+
+        return toc
+
+    def get_browser(self):
+        '''
+        Return the browser used for all downloads.
+
+        When both username and password are set, the Heise SSO login form is
+        filled in and submitted first; otherwise the browser is returned
+        without logging in.
+        '''
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
+            br.open(loginURL)
+            br.select_form(action='/sso/login/login')
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+
+        return br
+
+    def preprocess_html(self, soup):
+        '''
+        Replace every ``<a class="js-pswp-image">`` link by an ordinary
+        ``<img>`` tag and return the modified soup.
+
+        The page sizes its images dynamically via JavaScript, which an epub
+        can not work with. Links lacking the ``href`` or ``data-pswp-bu``
+        attribute are left untouched.
+        '''
+        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
+            # .get() returns None for a missing attribute; indexing would
+            # raise KeyError
+            src = aimg.get('href')
+            caption = aimg.get('data-pswp-bu')
+            if src is None or caption is None:
+                continue
+            # a fresh tag per link, so no image from an earlier iteration
+            # can be reused by accident
+            img = soup.new_tag('img',
+                               src=src,
+                               alt=caption,
+                               style="display: block;")
+            aimg.replaceWith(img)
+
+        return soup
--- a/recipes/heise_ix.recipe
+++ b/recipes/heise_ix.recipe
@@ -0,0 +1,117 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+__license__ = 'GPL v3'
+__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'
+'''
+Heise Select Magazine - iX
+'''
+
+
+class heise_select(BasicNewsRecipe):
+    '''
+    Recipe that downloads one issue of the Heise iX magazine from
+    https://www.heise.de/select.
+
+    parse_index() uses the current issue unless ``issue`` is overridden,
+    get_browser() logs in through the Heise SSO form when credentials are
+    set, and preprocess_html() turns the JavaScript image links into plain
+    <img> tags.
+    '''
+
+    issue = None
+    # overwrite this for easy download of previous issues
+    # issue = '/select/ix/2020/3'
+
+    title = 'iX'
+    timefmt = ''
+    __author__ = 'Ralf Hein'
+    needs_subscription = True
+    description = 'Das iX Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
+    publisher = 'Heise Verlag'
+    authors = 'Heise Verlag'
+    category = 'it'
+    tags = 'Magazin, IT, computer, ix'
+    publication_type = 'magazine'
+    no_stylesheets = True
+    use_embedded_content = False
+    compress_news_images = True
+    encoding = 'utf-8'
+    language = 'de'
+
+    conversion_options = {
+        'base_font_size': 10,
+        'no_inline_navbars': True,
+        'language': language,
+        'publisher': publisher,
+        'authors': publisher
+    }
+
+    # some code cleanup
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='link', attrs={'rel': 'icon'}),
+        dict(name='link', attrs={'rel': 'dns-prefetch'}),
+        dict(name='link', attrs={'rel': 'preconnect'}),
+        dict(name='div', attrs={'class': 'meta__group--issue'}),
+        dict(name='p', attrs={'class': 'comment'}),
+        dict(name='div', attrs={'class': 'pswp'}),
+        dict(name='div', attrs={'class': 'bottom-links'}),
+    ]
+
+    # content is neatly within <main> element
+    remove_tags_before = [dict(name='main')]
+    remove_tags_after = [dict(name='main')]
+
+    def parse_index(self):
+        '''
+        Build the table of contents of the selected issue.
+
+        When ``self.issue`` is None, the link to the current issue is looked
+        up on the /select overview page first. As side effects the issue
+        number is appended to ``self.title`` and ``self.cover_url`` is set.
+
+        Returns a list of ``(section_title, articles)`` tuples, where every
+        article is a dict with the keys 'title' and 'url'.
+
+        Raises ValueError when the current issue link cannot be found on the
+        overview page.
+        '''
+        baseref = 'https://www.heise.de'
+        # find current issue if not defined
+        if self.issue is None:
+            soup = self.index_to_soup(baseref + '/select')
+            sec = soup.find('section', attrs={'class': 'magazine--ix'})
+            issue_link = None
+            if sec is not None:
+                issue_link = sec.find(
+                    'a', attrs={'class': 'magazine__link--issue'}, href=True
+                )
+            if issue_link is None:
+                # fail with a readable message instead of an AttributeError
+                raise ValueError(
+                    'Could not find the current iX issue on ' + baseref + '/select'
+                )
+            self.issue = issue_link['href']
+
+        issue_num = self.issue.replace('/select/ix/', '')
+        # fix title with issue number to keep them neatly organised
+        self.title += ' ' + issue_num.replace('/', '-')
+        self.cover_url = 'https://www.heise.de/select/thumbnail/ix/' + issue_num
+
+        soup = self.index_to_soup(baseref + self.issue)
+        toc = []
+
+        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
+            ul = h3.find_next('ul')
+            if ul is None:
+                # section heading without an article list: nothing to collect
+                continue
+
+            articles = []
+            for li in ul.findAll('li', attrs={'class': 'xp__toc__item'}):
+                anchor = li.find('a', attrs={'class': 'xp__link'}, href=True)
+                subtitle = li.find(
+                    'span', attrs={'class': 'xp__toc__item-subtitle'}
+                )
+                if anchor is None or subtitle is None:
+                    # skip malformed entries rather than abort the download
+                    continue
+                articles.append(
+                    {'title': subtitle.text, 'url': baseref + anchor['href']}
+                )
+            toc.append((h3.text, articles))
+
+        return toc
+
+    def get_browser(self):
+        '''
+        Return the browser used for all downloads.
+
+        When both username and password are set, the Heise SSO login form is
+        filled in and submitted first; otherwise the browser is returned
+        without logging in.
+        '''
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
+            br.open(loginURL)
+            br.select_form(action='/sso/login/login')
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+
+        return br
+
+    def preprocess_html(self, soup):
+        '''
+        Replace every ``<a class="js-pswp-image">`` link by an ordinary
+        ``<img>`` tag and return the modified soup.
+
+        The page sizes its images dynamically via JavaScript, which an epub
+        can not work with. Links lacking the ``href`` or ``data-pswp-bu``
+        attribute are left untouched.
+        '''
+        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
+            # .get() returns None for a missing attribute; indexing would
+            # raise KeyError
+            src = aimg.get('href')
+            caption = aimg.get('data-pswp-bu')
+            if src is None or caption is None:
+                continue
+            # a fresh tag per link, so no image from an earlier iteration
+            # can be reused by accident
+            img = soup.new_tag(
+                'img',
+                src=src,
+                alt=caption,
+                style="display: block;"
+            )
+            aimg.replaceWith(img)
+
+        return soup