From 480cc884fe36efcb816e32ea0d3914c1c558c979 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 18 Dec 2013 09:47:45 +0530 Subject: [PATCH] Update NZZ --- recipes/nzz_ger.recipe | 12 ++++---- recipes/nzz_webpaper.recipe | 56 +++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/recipes/nzz_ger.recipe b/recipes/nzz_ger.recipe index 001c8075da..3430cfa76a 100644 --- a/recipes/nzz_ger.recipe +++ b/recipes/nzz_ger.recipe @@ -1,4 +1,3 @@ - __license__ = 'GPL v3' __copyright__ = '2009-2010, Darko Miletic , 2012 Bernd Leinfelder ' @@ -25,11 +24,11 @@ class Nzz(BasicNewsRecipe): .artikel h3,.artikel h4,.bildLegende,.question,.autor{font-family: Arial,Verdana,Helvetica,sans-serif} .bildLegende{font-size: small} .autor{font-size: 0.9375em; color: #666666} - .quote{font-size: large !important; - font-style: italic; - font-weight: normal !important; - border-bottom: 1px dotted #BFBFBF; - border-top: 1px dotted #BFBFBF; + .quote{font-size: large !important; + font-style: italic; + font-weight: normal !important; + border-bottom: 1px dotted #BFBFBF; + border-top: 1px dotted #BFBFBF; line-height: 1.25em} .quelle{color: #666666; font-style: italic; white-space: nowrap} """ @@ -41,7 +40,6 @@ class Nzz(BasicNewsRecipe): ,'publisher' : publisher } - remove_attributes=['width','height','lang'] remove_tags_before = dict(id='main') remove_tags_after = dict(id='articleBodyText') diff --git a/recipes/nzz_webpaper.recipe b/recipes/nzz_webpaper.recipe index da86de3588..2e1669313c 100644 --- a/recipes/nzz_webpaper.recipe +++ b/recipes/nzz_webpaper.recipe @@ -1,3 +1,4 @@ +import re from calibre import strftime __license__ = 'GPL v3' @@ -7,6 +8,7 @@ __copyright__ = '2012, Bernd Leinfelder ' webpaper.nzz.ch ''' +from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.recipes import BasicNewsRecipe class Nzz(BasicNewsRecipe): @@ -23,6 +25,7 @@ class Nzz(BasicNewsRecipe): encoding = 'utf-8' use_embedded_content = False language = 'de' + temp_files = [] extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' conversion_options = { @@ -32,7 +35,7 @@ class Nzz(BasicNewsRecipe): ,'publisher' : publisher } - remove_tags = [dict(name='footer')] + remove_tags = [dict(name='footer') , dict({'class' : 'fullarticle__related'})] remove_tags_before = dict(name='article') remove_tags_after= dict(name='footer') @@ -41,47 +44,52 @@ class Nzz(BasicNewsRecipe): baseref = 'https://webpaper.nzz.ch' soup = self.index_to_soup(baseref) + # print soup.prettify() + articles = {} + sections = [] ans = [] + issue = soup.find("link",rel="prefetch") - issuelist = soup.find(id="issueSelectorList") + soup = self.index_to_soup(baseref+issue['href']) + # print soup.prettify() + section = "" + lastsection = "" + pubdate = strftime('%a, %d %b') - feeds = issuelist.findAll("a") - for f in feeds: - section = f.string - sectionref = baseref + f['href'] + articlesoup = soup.findAll("article",{"class" : re.compile(".*fullarticle[ \"].*")}) + for art in articlesoup: + # print art.prettify() + section=art['data-department'] + print "section is "+section - ans.append(section) + if section != lastsection: + sections.append(section) + articles[section]=[] + lastsection=section - articlesoup = self.index_to_soup(sectionref) + caption = art.find("h2") - articlesoup = articlesoup.findAll('article','article') - for a in articlesoup: - artlink = a.find('a') + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write(art.prettify()) + self.temp_files[-1].close() + filename = self.temp_files[-1].name - arthref = baseref + artlink['href'] - arthead = a.find('h2') - artcaption = arthead.string + articles[section].append( + dict(title=caption.string,url='file://'+filename, date=pubdate, description='', content='')) - pubdate = strftime('%a, %d %b') + ans = [(key, articles[key]) for key in sections if key in articles] - if not artcaption is None: - if not articles.has_key(section): - articles[section] = [] - articles[section].append( - dict(title=artcaption, url=arthref, date=pubdate, description='', content='')) + # pprint.pprint(ans) - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans def get_browser(self): br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: - br.open('https://cas.nzz.ch/cas/login') + br.open('https://webpaper.nzz.ch/login') br.select_form(nr=0) br['username'] = self.username br['password'] = self.password br.submit() return br - -