From 2a551e3ab351b31ee41b901292ac5d229f0a2948 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 2 Nov 2010 11:40:09 -0600 Subject: [PATCH 1/2] Fix #7389 (Updated Recipe: ZEIT ONLINE) --- resources/recipes/zeitde.recipe | 81 +++++++-------------------------- 1 file changed, 17 insertions(+), 64 deletions(-) diff --git a/resources/recipes/zeitde.recipe b/resources/recipes/zeitde.recipe index 7f2ca0f6b2..35835e0e6d 100644 --- a/resources/recipes/zeitde.recipe +++ b/resources/recipes/zeitde.recipe @@ -6,22 +6,25 @@ Fetch Die Zeit. ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class ZeitDe(BasicNewsRecipe): - title = 'ZEIT Online' - description = 'ZEIT Online' + title = 'Zeit Online' + description = 'Zeit Online' language = 'de' - lang = 'de_DE' - __author__ = 'Martin Pitt, Sujata Raman and Ingo Paschke' - use_embedded_content = False + __author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing' + max_articles_per_feed = 40 - remove_empty_feeds = True - no_stylesheets = True - no_javascript = True - encoding = 'utf-8' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':["response","pagination block","pagenav","inline link", "copyright"] }), + dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }), + dict(name='div', attrs={'id':["place_5","place_4","comments"]}) + ] + + keep_only_tags = [dict(id=['main'])] feeds = [ ('Seite 1', 'http://newsfeed.zeit.de/index_xml'), @@ -40,43 +43,15 @@ class ZeitDe(BasicNewsRecipe): ('Sport', 'http://newsfeed.zeit.de/sport/index'), ] - extra_css = ''' - .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:small;} - .title{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;} - .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} - .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} - .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small} - .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small} - .inline{float:left;margin-top:0;margin-right:15px;position:relative;width:180px; } - img.inline{float:none} - .intertitle{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small;font-weight:700} - .ebinfobox{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small;list-style-type:none;float:right;margin-top:0;border-left-style:solid;border-left-width:1px;padding-left:10px;} - .infobox {border-style: solid; border-width: 1px;padding:8px;} - .infobox dt {font-weight:700;} - ''' + extra_css = '.reaktion,.taglist,.comments,.reponse,.responsetitle,.responsebody,.reponse,.inline,.date{display:none;}li.date{display:block}' + #filter_regexps = [r'ad.de.doubleclick.net/'] - keep_only_tags = [ - dict(name='div', attrs={'class':["article"]}) , - dict(name='ul', attrs={'class':["tools"]}) , - ] - remove_tags = [ - dict(name='link'), dict(name='iframe'),dict(name='style'),dict(name='meta'), - dict(name='div', attrs={'class':["pagination block","pagenav","inline link", "copyright"] }), - dict(name='p', attrs={'class':["ressortbacklink", "copyright"] }), - dict(name='div', attrs={'id':["place_5","place_4","comments"]}) - ] - - remove_attributes = ['style', 'font'] - def get_article_url(self, article): ans = article.get('link',None) - ans += "?page=all" + ans += "?page=all&print=true" - if 'video' in ans or 'quiz' in ans : + if 'video' in ans or 'quiz' in ans or 'blog' in ans : ans = None return ans @@ -86,25 +61,3 @@ class ZeitDe(BasicNewsRecipe): return inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','') except: return 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg' - - def preprocess_html(self, soup): - soup.html['xml:lang'] = self.lang - soup.html['lang'] = self.lang - mtag = '' - soup.head.insert(0,mtag) - title = soup.find('h2', attrs={'class':'title'}) - if title is None: - print "no title" - return soup - info = Tag(soup,'ul',[('class','ebinfobox')]) - tools = soup.find('ul', attrs={'class':'tools'}) - #author = tools.find('li','author first') - for tag in ['author first', 'date', 'date first', 'author', 'source']: - line = tools.find('li', tag) - if line: - info.insert(0,line) - title.parent.insert(0,info) - tools.extract() - return soup - - From 3450d1ad94f79796214705ec6b173e2c400d64f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 2 Nov 2010 11:42:29 -0600 Subject: [PATCH 2/2] Fix Fudzilla --- resources/recipes/fudzilla.recipe | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/resources/recipes/fudzilla.recipe b/resources/recipes/fudzilla.recipe index 821488ad0a..b47b4d4cab 100644 --- a/resources/recipes/fudzilla.recipe +++ b/resources/recipes/fudzilla.recipe @@ -25,15 +25,15 @@ class Fudzilla(BasicNewsRecipe): remove_tags_before = dict(name='div', attrs={'class':['padding']}) remove_tags = [dict(name='td', attrs={'class':['left','right']}), - dict(name='div', attrs={'id':['toolbar','buttons']}), - dict(name='div', attrs={'class':['artbannersxtd','back_button']}), - dict(name='span', attrs={'class':['pathway']}), - dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), - dict(name='table', attrs={'class':['headlines']}), + dict(name='div', attrs={'id':['toolbar','buttons']}), + dict(name='div', attrs={'class':['artbannersxtd','back_button']}), + dict(name='span', attrs={'class':['pathway']}), + dict(name='th', attrs={'class':['pagenav_next','pagenav_prev']}), + dict(name='table', attrs={'class':['headlines']}), ] feeds = [ - (u'Posts', u'http://www.fudzilla.com/index.php?option=com_rss&feed=RSS2.0&no_html=1') + (u'Posts', u'http://www.fudzilla.com/?format=feed') ] preprocess_regexps = [