From be261dcd712de312364b27bdf592d26396816725 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Apr 2020 12:34:09 +0530
Subject: [PATCH] Heise ct and iX by Ralf Hein

---
 recipes/heise_ct.recipe | 153 ++++++++++++++++++++++++++++++++++++++++
 recipes/heise_ix.recipe | 153 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 recipes/heise_ct.recipe
 create mode 100644 recipes/heise_ix.recipe

diff --git a/recipes/heise_ct.recipe b/recipes/heise_ct.recipe
new file mode 100644
index 0000000000..66ec19a02b
--- /dev/null
+++ b/recipes/heise_ct.recipe
@@ -0,0 +1,153 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+__license__ = 'GPL v3'
+__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'
+'''
+Heise Select Magazine - ct
+'''
+
+
+# Recipe for one issue of the ct magazine from Heise Select. Logging in
+# (see get_browser) needs the credentials of a Heise Plus subscription.
+class heise_select(BasicNewsRecipe):
+    issue = None
+    # overwrite this for easy download of previous issues
+    # issue = '/select/ct/2020/8'
+
+    title = 'Heise ct'
+    timefmt = ''
+    __author__ = 'Ralf Hein'
+    needs_subscription = True
+    description = 'Das ct Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
+    publisher = 'Heise Verlag'
+    authors = 'Heise Verlag'
+    category = 'it'
+    tags = 'Magazin, IT, computer, ct'
+    publication_type = 'magazine'
+    no_stylesheets = True
+    use_embedded_content = False
+    compress_news_images = True
+    encoding = 'utf-8'
+    language = 'de'
+
+    conversion_options = {
+        'base_font_size': 10,
+        'no_inline_navbars': True,
+        'language': language,
+        'publisher': publisher,
+        'authors': publisher
+    }
+
+    # some code cleanup
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='link', attrs={'rel': 'icon'}),
+        dict(name='link', attrs={'rel': 'dns-prefetch'}),
+        dict(name='link', attrs={'rel': 'preconnect'}),
+        dict(name='div', attrs={'class': 'meta__group--issue'}),
+        dict(name='p', attrs={'class': 'comment'}),
+        dict(name='div', attrs={'class': 'pswp'}),
+        dict(name='div', attrs={'class': 'bottom-links'}),
+    ]
+
+    # content is neatly within the main element
+    remove_tags_before = [dict(name='main')]
+    remove_tags_after = [dict(name='main')]
+
+    def parse_index(self):
+        '''
+        Build the table of contents of the selected issue.
+
+        When self.issue is None the current issue is looked up on the
+        Heise Select start page. Also appends the issue number to
+        self.title and sets self.cover_url.
+
+        Returns a list of (section_title, articles) tuples; every article
+        is a dict with the keys 'title' and 'url'.
+
+        Raises ValueError if the link to the current issue cannot be found.
+        '''
+        baseref = 'https://www.heise.de'
+        # find current issue if not defined
+        if self.issue is None:
+            soup = self.index_to_soup(baseref + '/select')
+            sec = soup.find('section', attrs={'class': 'magazine--ct'})
+            link = None
+            if sec is not None:
+                link = sec.find('a',
+                                attrs={'class': 'magazine__link--issue'},
+                                href=True)
+            if link is None:
+                # readable error instead of an AttributeError/TypeError on None
+                raise ValueError('Could not find the current ct issue on ' +
+                                 baseref + '/select')
+            self.issue = link['href']
+
+        issue_num = self.issue.replace('/select/ct/', '')
+        # fix title with issue number to keep them neatly organised
+        self.title += ' ' + issue_num.replace('/', '-')
+        self.cover_url = 'https://www.heise.de/select/thumbnail/ct/' + issue_num
+
+        soup = self.index_to_soup(baseref + self.issue)
+        toc = []
+
+        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
+            section_title = h3.text
+            articles = []
+            ul = h3.find_next('ul')
+            # a heading without a following list has no articles to collect
+            items = [] if ul is None else ul.findAll(
+                'li', attrs={'class': 'xp__toc__item'})
+
+            for li in items:
+                link = li.find('a', attrs={'class': 'xp__link'}, href=True)
+                subtitle = li.find('span',
+                                   attrs={'class': 'xp__toc__item-subtitle'})
+                # skip incomplete entries instead of aborting the download
+                if link is None or subtitle is None:
+                    continue
+                articles.append({
+                    'title': subtitle.text,
+                    'url': baseref + link['href']
+                })
+            toc.append((section_title, articles))
+
+        return toc
+
+    def get_browser(self):
+        '''
+        Return the browser; when both username and password are set, the
+        Heise SSO login form is filled in and submitted first.
+        '''
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
+            br.open(loginURL)
+            br.select_form(action='/sso/login/login')
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+
+        return br
+
+    def preprocess_html(self, soup):
+        '''
+        Replace a.js-pswp-image links by ordinary img tags.
+
+        Returns the modified soup.
+        '''
+        # images are dynamically sized via js + a-img tag, epub can not work with this
+        # construct ordinary img from it
+        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
+            # Tag.get() yields None for a missing attribute, whereas
+            # subscripting the tag would raise KeyError
+            src = aimg.get('href')
+            caption = aimg.get('data-pswp-bu')
+            if src is not None and caption is not None:
+                img = soup.new_tag('img',
+                                   src=src,
+                                   alt=caption,
+                                   style="display: block;")
+                aimg.replaceWith(img)
+
+        return soup
diff --git a/recipes/heise_ix.recipe b/recipes/heise_ix.recipe
new file mode 100644
index 0000000000..dadbf1ac94
--- /dev/null
+++ b/recipes/heise_ix.recipe
@@ -0,0 +1,153 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+__license__ = 'GPL v3'
+__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'
+'''
+Heise Select Magazine - iX
+'''
+
+
+# Recipe for one issue of the iX magazine from Heise Select. Logging in
+# (see get_browser) needs the credentials of a Heise Plus subscription.
+class heise_select(BasicNewsRecipe):
+    issue = None
+    # overwrite this for easy download of previous issues
+    # issue = '/select/ix/2020/3'
+
+    title = 'iX'
+    timefmt = ''
+    __author__ = 'Ralf Hein'
+    needs_subscription = True
+    description = 'Das iX Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
+    publisher = 'Heise Verlag'
+    authors = 'Heise Verlag'
+    category = 'it'
+    tags = 'Magazin, IT, computer, ix'
+    publication_type = 'magazine'
+    no_stylesheets = True
+    use_embedded_content = False
+    compress_news_images = True
+    encoding = 'utf-8'
+    language = 'de'
+
+    conversion_options = {
+        'base_font_size': 10,
+        'no_inline_navbars': True,
+        'language': language,
+        'publisher': publisher,
+        'authors': publisher
+    }
+
+    # some code cleanup
+    remove_tags = [
+        dict(name='meta'),
+        dict(name='link', attrs={'rel': 'icon'}),
+        dict(name='link', attrs={'rel': 'dns-prefetch'}),
+        dict(name='link', attrs={'rel': 'preconnect'}),
+        dict(name='div', attrs={'class': 'meta__group--issue'}),
+        dict(name='p', attrs={'class': 'comment'}),
+        dict(name='div', attrs={'class': 'pswp'}),
+        dict(name='div', attrs={'class': 'bottom-links'}),
+    ]
+
+    # content is neatly within the main element
+    remove_tags_before = [dict(name='main')]
+    remove_tags_after = [dict(name='main')]
+
+    def parse_index(self):
+        '''
+        Build the table of contents of the selected issue.
+
+        When self.issue is None the current issue is looked up on the
+        Heise Select start page. Also appends the issue number to
+        self.title and sets self.cover_url.
+
+        Returns a list of (section_title, articles) tuples; every article
+        is a dict with the keys 'title' and 'url'.
+
+        Raises ValueError if the link to the current issue cannot be found.
+        '''
+        baseref = 'https://www.heise.de'
+        # find current issue if not defined
+        if self.issue is None:
+            soup = self.index_to_soup(baseref + '/select')
+            sec = soup.find('section', attrs={'class': 'magazine--ix'})
+            link = None
+            if sec is not None:
+                link = sec.find('a',
+                                attrs={'class': 'magazine__link--issue'},
+                                href=True)
+            if link is None:
+                # readable error instead of an AttributeError/TypeError on None
+                raise ValueError('Could not find the current iX issue on ' +
+                                 baseref + '/select')
+            self.issue = link['href']
+
+        issue_num = self.issue.replace('/select/ix/', '')
+        # fix title with issue number to keep them neatly organised
+        self.title += ' ' + issue_num.replace('/', '-')
+        self.cover_url = 'https://www.heise.de/select/thumbnail/ix/' + issue_num
+
+        soup = self.index_to_soup(baseref + self.issue)
+        toc = []
+
+        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
+            section_title = h3.text
+            articles = []
+            ul = h3.find_next('ul')
+            # a heading without a following list has no articles to collect
+            items = [] if ul is None else ul.findAll(
+                'li', attrs={'class': 'xp__toc__item'})
+
+            for li in items:
+                link = li.find('a', attrs={'class': 'xp__link'}, href=True)
+                subtitle = li.find('span',
+                                   attrs={'class': 'xp__toc__item-subtitle'})
+                # skip incomplete entries instead of aborting the download
+                if link is None or subtitle is None:
+                    continue
+                articles.append({
+                    'title': subtitle.text,
+                    'url': baseref + link['href']
+                })
+            toc.append((section_title, articles))
+
+        return toc
+
+    def get_browser(self):
+        '''
+        Return the browser; when both username and password are set, the
+        Heise SSO login form is filled in and submitted first.
+        '''
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
+            br.open(loginURL)
+            br.select_form(action='/sso/login/login')
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+
+        return br
+
+    def preprocess_html(self, soup):
+        '''
+        Replace a.js-pswp-image links by ordinary img tags.
+
+        Returns the modified soup.
+        '''
+        # images are dynamically sized via js + a-img tag, epub can not work with this
+        # construct ordinary img from it
+        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
+            # Tag.get() yields None for a missing attribute, whereas
+            # subscripting the tag would raise KeyError
+            src = aimg.get('href')
+            caption = aimg.get('data-pswp-bu')
+            if src is not None and caption is not None:
+                img = soup.new_tag('img',
+                                   src=src,
+                                   alt=caption,
+                                   style="display: block;")
+                aimg.replaceWith(img)
+
+        return soup