'''
Fetch Die Zeit.
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

# Solr query on www.zeit.de, transformed to RSS by solr2rss.xsl: up to 50
# articles of one ressort, sorted by first release date (descending).
# `{0}` is replaced by the ressort name.
_RESSORT_FEED_URL = (
    'http://www.zeit.de/solr/select/?q=ressort:%22{0}%22%20type:article'
    '&version=2.2&start=0&rows=50&sort=date-first-released%20desc'
    '&indent=on&wt=xslt&tr=solr2rss.xsl'
)

# Ressorts that get their own feed; the feed title equals the ressort name.
_RESSORTS = (
    'Politik', 'Wirtschaft', 'Meinung', 'Gesellschaft', 'Kultur', 'Wissen',
    'Digital', 'Studium', 'Karriere', 'Lebensart', 'Reisen', 'Auto', 'Sport',
)

# Cover used when the current issue's cover cannot be determined.
_FALLBACK_COVER_URL = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'


def _ressort_feeds():
    '''Return one (title, url) feed tuple per entry of _RESSORTS.'''
    return [(name, _RESSORT_FEED_URL.format(name)) for name in _RESSORTS]


class ZeitDe(BasicNewsRecipe):
    '''Recipe for ZEIT Online: front page feed plus one feed per ressort.'''

    title = 'ZEIT Online Reader Edition'
    description = 'ZEIT Online'
    language = 'de'
    lang = 'de_DE'

    __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke'
    use_embedded_content = False
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    no_javascript = True
    encoding = 'utf-8'
    delay = 0

    feeds = [
        ('Seite 1', 'http://newsfeed.zeit.de/index'),
    ] + _ressort_feeds()

    extra_css = '''
                .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
                .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;}
                .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
                .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
                .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
                .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
                .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
                .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
                .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
                .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; }
                img.inline{float:none}
                .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700}
                .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;}
                .infobox {border-style: solid; border-width: 1px;padding:8px;}
                .infobox dt {font-weight:700;}
                '''
    #filter_regexps = [r'ad.de.doubleclick.net/']

    # The `tools` list is kept because preprocess_html() reads the
    # author/date/source entries out of it before discarding it.
    keep_only_tags = [
        dict(name='div', attrs={'class': ["article"]}),
        dict(name='ul', attrs={'class': ["tools"]}),
    ]
    remove_tags = [
        dict(name='link'), dict(name='iframe'), dict(name='style'), dict(name='meta'),
        dict(name='div', attrs={'class': ["pagination block", "pagenav", "inline link", "copyright"]}),
        dict(name='p', attrs={'class': ["ressortbacklink", "copyright"]}),
        dict(name='div', attrs={'id': ["place_5", "place_4", "comments"]}),
    ]

    remove_attributes = ['style', 'font']

    def get_article_url(self, article):
        '''
        Return the URL to download for a feed entry, or None to skip it.

        The entry's `link` is used with a `page=all` query parameter added.
        Entries without a link, and links containing 'video', 'quiz' or
        'blog.zeit.de/', are skipped.
        '''
        ans = article.get('link', None)
        if not ans:
            # Entry carries no link: nothing to download.
            return None
        # Each substring needs its own `in ans`; a bare string literal in an
        # `or` chain is always true and would reject every article.
        if 'video' in ans or 'quiz' in ans or 'blog.zeit.de/' in ans:
            return None
        # Do not start a second query string if the link already has one.
        separator = '&' if '?' in ans else '?'
        return ans + separator + 'page=all'

    def get_cover_url(self):
        '''
        Return the cover image URL taken from http://www.zeit.de/inhalt.

        The `src` of the image inside the 'singlearchive clearfix' div is
        used with 'icon_' stripped from it. If the page cannot be fetched
        or does not have the expected structure, a fixed fallback cover URL
        is returned instead.
        '''
        try:
            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
            archive = inhalt.find('div', attrs={'class': 'singlearchive clearfix'})
            return archive.img['src'].replace('icon_', '')
        except Exception:
            # Best effort: a missing cover must not abort the download.
            self.log('Could not determine current cover, using fallback')
            return _FALLBACK_COVER_URL

    def preprocess_html(self, soup):
        '''
        Set the document language and move the article meta data next to
        the title.

        The author/date/source `li` entries found in the `ul.tools` list are
        moved into a new `ul.ebinfobox`, which is inserted as the first child
        of the title's parent; the remaining `tools` list is then removed.
        Pages without an `h2.title` or without a `ul.tools` are returned
        with only the language attributes set.
        '''
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        # NOTE(review): mtag is an empty string here, so this insert adds no
        # markup; presumably a meta tag was intended - confirm.
        mtag = ''
        soup.head.insert(0, mtag)

        title = soup.find('h2', attrs={'class': 'title'})
        if title is None:
            self.log('no title')
            return soup

        tools = soup.find('ul', attrs={'class': 'tools'})
        if tools is None:
            # No meta data list on this page: nothing to move or remove.
            return soup

        info = Tag(soup, 'ul', [('class', 'ebinfobox')])
        for tag in ['author first', 'date', 'date first', 'author', 'source']:
            line = tools.find('li', tag)
            if line:
                # Inserting at index 0 puts later matches before earlier ones.
                info.insert(0, line)
        title.parent.insert(0, info)
        tools.extract()
        return soup