from calibre.web.feeds.recipes import BasicNewsRecipe

__license__ = 'GPL v3'
__copyright__ = 'Ralf Hein - ralfhein at GMX dot DE'

'''
Heise Select Magazine - iX
'''


class heise_select(BasicNewsRecipe):
    issue = None  # overwrite this for easy download of previous issues
    # issue = '/select/ix/2020/3'

    title = 'iX'
    timefmt = ''
    __author__ = 'Ralf Hein'
    needs_subscription = True
    description = 'Das iX Magazin als ePaper. Benötigt Heise Plus Digitalabo (siehe https://www.heise.de/plus/)'
    publisher = 'Heise Verlag'
    authors = 'Heise Verlag'
    category = 'it'
    tags = 'Magazin, IT, computer, ix'
    publication_type = 'magazine'
    no_stylesheets = True
    use_embedded_content = False
    compress_news_images = True
    encoding = 'utf-8'
    language = 'de'

    conversion_options = {
        'base_font_size': 10,
        'no_inline_navbars': True,
        'language': language,
        'publisher': publisher,
        'authors': publisher
    }

    # some markup cleanup
    remove_tags = [
        dict(name='meta'),
        dict(name='link', attrs={'rel': 'icon'}),
        dict(name='link', attrs={'rel': 'dns-prefetch'}),
        dict(name='link', attrs={'rel': 'preconnect'}),
        dict(name='div', attrs={'class': 'meta__group--issue'}),
        dict(name='p', attrs={'class': 'comment'}),
        dict(name='div', attrs={'class': 'pswp'}),
        dict(name='div', attrs={'class': 'bottom-links'}),
    ]

    # article content is neatly contained within the <main> element
    remove_tags_before = [dict(name='main')]
    remove_tags_after = [dict(name='main')]

    def parse_index(self):
        baseref = 'https://www.heise.de'

        # find the current issue if none is defined
        if self.issue is None:
            soup = self.index_to_soup(baseref + '/select')
            sec = soup.find('section', attrs={'class': 'magazine--ix'})
            self.issue = sec.find(
                'a', attrs={'class': 'magazine__link--issue'}, href=True
            )['href']

        issue_num = self.issue.replace('/select/ix/', '')
        # append the issue number to the title to keep downloads neatly organised
        self.title += ' ' + issue_num.replace('/', '-')
        self.cover_url = 'https://www.heise.de/select/thumbnail/ix/' + issue_num

        # build the table of contents from the issue overview page
        soup = self.index_to_soup(baseref + self.issue)
        toc = []
        for h3 in soup.findAll('h3', attrs={'class': 'xp__inhalt__title'}):
            section_title = h3.text
            articles = []
            ul = h3.find_next('ul')
            for li in ul.findAll('li', attrs={'class': 'xp__toc__item'}):
                article_uri = li.find('a', attrs={'class': 'xp__link'})['href']
                article_title = li.find(
                    'span', attrs={'class': 'xp__toc__item-subtitle'}
                ).text
                article = {'title': article_title, 'url': baseref + article_uri}
                articles.append(article)
            toc.append((section_title, articles))
        return toc

    def get_browser(self):
        # log in via the Heise SSO form so paywalled articles are accessible
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            loginURL = 'https://www.heise.de/sso/login?forward=%2Fselect'
            br.open(loginURL)
            br.select_form(action='/sso/login/login')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        # images are dynamically sized via JS and an <a> + <img> combination,
        # which EPUB cannot handle; construct an ordinary <img> tag from it
        for aimg in soup.findAll('a', attrs={'class': 'js-pswp-image'}):
            if aimg['href'] is not None and aimg['data-pswp-bu'] is not None:
                img = soup.new_tag(
                    'img', src=aimg['href'], alt=aimg['data-pswp-bu'],
                    style="display: block;"
                )
                if img is not None:
                    aimg.replaceWith(img)
        return soup