import re

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.recipes import BasicNewsRecipe

__license__ = 'GPL v3'
__copyright__ = '2012-15, Bernd Leinfelder '

''' webpaper.nzz.ch '''


class Nzz(BasicNewsRecipe):
    """Recipe that builds an e-book from the NZZ Webpaper (webpaper.nzz.ch).

    The current issue is fetched as a single page; every article element in
    it is written to its own temporary HTML file, and the index returned by
    ``parse_index`` points at those files via ``file://`` URLs.
    """

    title = 'NZZ Webpaper'
    __author__ = 'Bernd Leinfelder'
    description = 'Neue Zuercher Zeitung Webpaper - Erfordert NZZ Digital Abonnement. v20140425'
    timefmt = ' [%a, %d %b, %Y]'
    publisher = 'NZZ AG'
    needs_subscription = True
    category = 'news, politics, nachrichten, Switzerland'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'de'
    # Class-level list: every temp file created by parse_index is appended
    # here, so the list is shared by all instances of this class.
    temp_files = []

    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    remove_tags = [
        dict(name='footer'),
        dict({'class': ['sharebox', 'fullarticle__related']}),
    ]
    remove_tags_before = dict(name='article')
    remove_tags_after = dict(name='footer')

    def parse_index(self):
        """Build the section/article index for the current issue.

        Returns:
            A list of ``(section_name, articles)`` tuples in order of first
            appearance of each section.  ``articles`` is a list of dicts with
            the keys ``title``, ``url``, ``date``, ``description`` and
            ``content``; ``url`` is a ``file://`` URL of a temporary file
            holding the prettified article markup.

        Raises:
            ValueError: if the landing page has no ``<link rel="prefetch">``
                element from which the issue URL can be taken.
        """
        baseref = 'https://webpaper.nzz.ch'
        landing = self.index_to_soup(baseref)

        # The landing page references the current issue through a
        # <link rel="prefetch"> element; its href is relative to baseref.
        issue = landing.find("link", rel="prefetch")
        if issue is None:
            raise ValueError(
                'No <link rel="prefetch"> found on ' + baseref +
                '; cannot locate the current issue')
        soup = self.index_to_soup(baseref + issue['href'])

        # Swap every <span data-src-640="..."> placeholder for a real <img>
        # whose src is the value of that attribute.
        for span in soup.findAll('span', attrs={'data-src-640': True}):
            img_tag = Tag(soup, "img", [("src", span['data-src-640'])])
            span.replaceWith(img_tag)

        articles = {}
        sections = []
        pubdate = strftime('%a, %d %b')

        article_tags = soup.findAll(
            "article", {"class": re.compile(r'.*fullarticle[ "].*')})
        for art in article_tags:
            section = art['data-department']
            # Register a section only the first time it is seen.  Comparing
            # against the previous section instead would wipe the collected
            # articles and list the section twice whenever a department
            # reappears after a different one.
            if section not in articles:
                sections.append(section)
                articles[section] = []

            caption = art.find("h2")

            # Persist the article markup so it can be addressed by URL.
            # NOTE(review): the empty string literals around prettify() look
            # like wrapper markup that was lost somewhere - confirm against
            # the upstream recipe before changing them.
            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("" + art.prettify() + "")
            self.temp_files[-1].close()
            filename = self.temp_files[-1].name

            articles[section].append(
                dict(title=caption.string, url='file://' + filename,
                     date=pubdate, description='', content=''))

        return [(name, articles[name]) for name in sections]

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        When both ``self.username`` and ``self.password`` are set, the login
        page is opened, its first form is filled with the credentials and
        submitted.  Otherwise the default browser is returned untouched.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://webpaper.nzz.ch/login')
            br.select_form(nr=0)  # the login form is the first form on the page
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br