diff --git a/recipes/deutsche_welle_en.recipe b/recipes/deutsche_welle_en.recipe index fb41bdeead..e0ec0c39ac 100644 --- a/recipes/deutsche_welle_en.recipe +++ b/recipes/deutsche_welle_en.recipe @@ -1,66 +1,73 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import unicode_literals, division, absolute_import, print_function + __license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' + ''' -dw-world.de +Deutsche Welle (english) - dw.com/en ''' +import re from calibre.web.feeds.news import BasicNewsRecipe class DeutscheWelle_en(BasicNewsRecipe): - title = 'Deutsche Welle' - __author__ = 'Darko Miletic' - description = 'News from Germany and World' - publisher = 'Deutsche Welle' - category = 'news, politics, Germany' - oldest_article = 1 - max_articles_per_feed = 100 - use_embedded_content = False - no_stylesheets = True - language = 'en' - publication_type = 'newsportal' - remove_empty_feeds = True - masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - body{font-family: Arial,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ + title = 'Deutsche Welle' + __author__ = 'Darko Miletic' + description = 'News from Germany and the world' + publisher = 'Deutsche Welle' + language = 'en' + oldest_article = 1 + max_articles_per_feed = 50 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + feeds = [ + ('Top Stories', 'http://rss.dw-world.de/rdf/rss-en-top'), + ('World', 'http://rss.dw.de/rdf/rss-en-world'), + ('Germany', 'http://rss.dw.de/rdf/rss-en-ger'), + ('Europe', 'http://rss.dw.de/rdf/rss-en-eu'), + ('Business', 'http://rss.dw.de/rdf/rss-en-bus'), + ('Culture & Lifestyle', 'http://rss.dw.de/rdf/rss-en-cul'), + ('Sports', 'http://rss.dw.de/rdf/rss-en-sports'), + ('Visit Germany', 'http://rss.dw.de/rdf/rss-en-visitgermany'), + ('Asia', 'http://rss.dw.de/rdf/rss-en-asia') + ] + + keep_only_tags=[ + dict(name='div', attrs={'class':'col3'}) + ] + + remove_tags_after = [ + dict(name='div', attrs={'class':'group'}) + ] remove_tags = [ - dict(name=['iframe','embed','object','form','base','meta','link']) - ,dict(attrs={'class':'actionFooter'}) - ] - keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})] - remove_attributes = ['height','width','onclick','border','lang'] + dict(name='div', attrs={'class':'col1'}), + dict(name='div', attrs={'class':re.compile('gallery')}), + dict(name='div', attrs={'class':re.compile('audio')}), + dict(name='div', attrs={'class':re.compile('video')}) + ] - feeds = [(u'All news', u'http://rss.dw-world.de/rdf/rss-en-all')] + remove_attributes = ['height', 'width', 'onclick', 'border', 'lang', 'link'] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl + extra_css = ''' + h1 {font-size: 1.6em; margin-top: 0em} + .artikel {font-size: 1em; text-transform: uppercase; margin: 0em} + ''' def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - if item.has_key('target'): - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + # convert local hyperlinks + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.dw.com' + a['href'] + elif a['href'].startswith('#'): + del a['href'] + # remove all style attributes with an effect on font size + for item in soup.findAll(attrs={'style':re.compile('font-size')}): + del item['style'] return soup - diff --git a/recipes/deutsche_welle_es.recipe b/recipes/deutsche_welle_es.recipe index 7cf58b0a55..f2450509c6 100644 --- a/recipes/deutsche_welle_es.recipe +++ b/recipes/deutsche_welle_es.recipe @@ -1,66 +1,74 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import unicode_literals, division, absolute_import, print_function + __license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' + ''' -dw-world.de +Deutsche Welle (español) - dw.com/es ''' +import re from calibre.web.feeds.news import BasicNewsRecipe class DeutscheWelle_es(BasicNewsRecipe): - title = 'Deutsche Welle' - __author__ = 'Darko Miletic' - description = 'Noticias desde Alemania y mundo' - publisher = 'Deutsche Welle' - category = 'news, politics, Germany' - oldest_article = 1 - max_articles_per_feed = 100 - use_embedded_content = False - no_stylesheets = True - language = 'de' - publication_type = 'newsportal' - remove_empty_feeds = True - masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - body{font-family: Arial,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ + title = 'Deutsche Welle' + __author__ = 'Darko Miletic' + description = 'Noticias desde Alemania y mundo' + publisher = 'Deutsche Welle' + language = 'es' + oldest_article = 2 + max_articles_per_feed = 50 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + feeds = [ + ('Titulares', 'http://rss.dw-world.de/rdf/rss-sp-top'), + ('Noticias de Alemania', 'http://rss.dw-world.de/rdf/rss-sp-ale'), + ('Internacionales', 'http://rss.dw-world.de/rdf/rss-sp-inter'), + ('Cultura', 'http://rss.dw-world.de/rdf/rss-sp-cul'), + ('Ciencia y Tecnología', 'http://rss.dw-world.de/rdf/rss-sp-cyt'), + ('Economía', 'http://rss.dw-world.de/rdf/rss-sp-eco'), + ('La prensa opina', 'http://rss.dw-world.de/rdf/rss-sp-press'), + ('Ecología', 'http://rss.dw-world.de/rdf/rss-sp-ecol'), + ('Futbol alemán', 'http://rss.dw-world.de/rdf/rss-sp-fut'), + ('Conozca Alemania', 'http://rss.dw-world.de/rdf/rss-sp-con') + ] + + keep_only_tags=[ + dict(name='div', attrs={'class':'col3'}) + ] + + remove_tags_after = [ + dict(name='div', attrs={'class':'group'}) + ] remove_tags = [ - dict(name=['iframe','embed','object','form','base','meta','link']) - ,dict(attrs={'class':'actionFooter'}) - ] - keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})] - remove_attributes = ['height','width','onclick','border','lang'] + dict(name='div', attrs={'class':'col1'}), + dict(name='div', attrs={'class':re.compile('gallery')}), + dict(name='div', attrs={'class':re.compile('audio')}), + dict(name='div', attrs={'class':re.compile('video')}) + ] - feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-sp-all')] + remove_attributes = ['height', 'width', 'onclick', 'border', 'lang', 'link'] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl + extra_css = ''' + h1 {font-size: 1.6em; margin-top: 0em} + .artikel {font-size: 1em; text-transform: uppercase; margin: 0em} + ''' def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - if item.has_key('target'): - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + # convert local hyperlinks + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.dw.com' + a['href'] + elif a['href'].startswith('#'): + del a['href'] + # remove all style attributes with an effect on font size + for item in soup.findAll(attrs={'style':re.compile('font-size')}): + del item['style'] return soup -