diff --git a/recipes/deutsche_welle_de.recipe b/recipes/deutsche_welle_de.recipe
index 2d988f7c86..4ee5bd9c67 100644
--- a/recipes/deutsche_welle_de.recipe
+++ b/recipes/deutsche_welle_de.recipe
@@ -1,21 +1,12 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-# History:
-# 1: Base Version
-# 2: Added rules for wdr.de, ndr.de, br-online.de
-# 3: Added rules for rbb-online.de, boerse.ard.de, sportschau.de
-# 4: New design of tagesschau.de implemented. Simplified.
-# 5: Taken out the pictures.
-
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 class DeutscheWelle(BasicNewsRecipe):
     title = 'Deutsche Welle'
     description = 'Nachrichten der Deutschen Welle (DW)'
     publisher = 'DW - info@dw.com'
     language = 'de'
-    version = 1
-    cover_url = 'https://pbs.twimg.com/profile_images/900269457976823808/nkod9w_m_400x400.jpg'
-    __author__ = 'VoHe'
-    oldest_article = 3
+    __author__ = 'unkn0wn'
+    oldest_article = 2
     max_articles_per_feed = 200
     no_stylesheets = True
     remove_javascript = True
@@ -23,26 +14,38 @@ class DeutscheWelle(BasicNewsRecipe):
     remove_javascript = True
     remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
+    remove_attributes = ['height', 'width', 'style']
 
-    remove_tags_before = dict(name='h4', attrs={'class':'artikel'})
-
-    remove_tags_after = dict(name='div', attrs={'class':'col1 dim'})
+    keep_only_tags = [
+        dict(name='article')
+    ]
 
     remove_tags = [
-        dict(name='div', attrs={'class':'footerSection'}),
-        dict(name='div', attrs={'class':'sharing-bar'}),
-        dict(name='div', attrs={'class':'coll dim'}),
-        dict(name='div', attrs={'class':'languageSection'}),
+        dict(name=['footer', 'source']),
+        dict(attrs={'data-tracking-name':'sharing-icons-inline'}),
+        classes('kicker advertisement vjs-wrapper')
     ]
+
     # watch out https://www.dw.com/de/service/rss/s-9773 for description of possible rss feeds
     feeds = [
-        ('Thema des Tages', 'http://rss.dw.com/xml/rss-de-top'),
-        # ('Nachrichten', 'http://rss.dw.com/xml/rss-de-news'),
+        ('Nachrichten', 'http://rss.dw.com/xml/rss-de-news'),
         ('Wissenschaft', 'http://rss.dw.com/xml/rss-de-wissenschaft'),
-        # ('Sport', 'http://rss.dw.com/xml/rss-de-sport'),
+        ('Sport', 'http://rss.dw.com/xml/rss-de-sport'),
         ('Deuschland entdecken', 'http://rss.dw.com/xml/rss-de-deutschlandentdecken'),
         ('Presse', 'http://rss.dw.com/xml/presse'),
         ('Politik', 'http://rss.dw.com/xml/rss_de_politik'),
         ('Wirtschaft', 'http://rss.dw.com/xml/rss-de-eco'),
         ('Kultur und Leben', 'http://rss.dw.com/xml/rss-de-cul'),
+        ('Thema des Tages', 'http://rss.dw.com/xml/rss-de-top'),
     ]
+
+    def preprocess_html(self, soup):
+        '''Give every <img> a plain src chosen from its srcset candidates.'''
+        for img in soup.findAll('img', srcset=True):
+            # Assumes srcset alternates URL and descriptor tokens, so every
+            # second token is a URL. Prefer the fourth candidate; fall back
+            # to the last one listed instead of raising IndexError.
+            urls = img['srcset'].split()[::2]
+            if urls:
+                img['src'] = urls[min(3, len(urls) - 1)]
+        return soup
diff --git a/recipes/deutsche_welle_en.recipe b/recipes/deutsche_welle_en.recipe
index 3cde7e7418..faa02a6183 100644
--- a/recipes/deutsche_welle_en.recipe
+++ b/recipes/deutsche_welle_en.recipe
@@ -1,34 +1,31 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-from __future__ import unicode_literals, division, absolute_import, print_function
-
-__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
-
-'''
-Deutsche Welle (english) - dw.com/en
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
 class DeutscheWelle_en(BasicNewsRecipe):
     title = 'Deutsche Welle'
-    __author__ = 'Darko Miletic'
+    __author__ = 'unkn0wn'
     description = 'News from Germany and the world'
     publisher = 'Deutsche Welle'
     language = 'en'
-    oldest_article = 1
+    oldest_article = 2
     max_articles_per_feed = 50
     no_stylesheets = True
     remove_javascript = True
     remove_empty_feeds = True
     ignore_duplicate_articles = {'title', 'url'}
-
+    remove_attributes = ['height', 'width', 'style']
+
+    keep_only_tags = [
+        dict(name='article')
+    ]
+
+    remove_tags = [
+        dict(name=['footer', 'source']),
+        dict(attrs={'data-tracking-name':'sharing-icons-inline'}),
+        classes('kicker advertisement vjs-wrapper')
+    ]
+
     feeds = [
-        ('Top Stories', 'http://rss.dw-world.de/rdf/rss-en-top'),
         ('World', 'http://rss.dw.de/rdf/rss-en-world'),
         ('Germany', 'http://rss.dw.de/rdf/rss-en-ger'),
         ('Europe', 'http://rss.dw.de/rdf/rss-en-eu'),
@@ -36,40 +33,17 @@ class DeutscheWelle_en(BasicNewsRecipe):
         ('Culture & Lifestyle', 'http://rss.dw.de/rdf/rss-en-cul'),
         ('Sports', 'http://rss.dw.de/rdf/rss-en-sports'),
         ('Visit Germany', 'http://rss.dw.de/rdf/rss-en-visitgermany'),
-        ('Asia', 'http://rss.dw.de/rdf/rss-en-asia')
+        ('Asia', 'http://rss.dw.de/rdf/rss-en-asia'),
+        ('Top Stories', 'http://rss.dw-world.de/rdf/rss-en-top'),
     ]
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class': 'col3'})
-    ]
-
-    remove_tags_after = [
-        dict(name='div', attrs={'class': 'group'})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'class': 'col1'}),
-        dict(name='div', attrs={'class': re.compile('gallery')}),
-        dict(name='div', attrs={'class': re.compile('audio')}),
-        dict(name='div', attrs={'class': re.compile('video')})
-    ]
-
-    remove_attributes = ['height', 'width',
-                         'onclick', 'border', 'lang', 'link']
-
-    extra_css = '''
-        h1 {font-size: 1.6em; margin-top: 0em}
-        .artikel {font-size: 1em; text-transform: uppercase; margin: 0em}
-    '''
-
+
     def preprocess_html(self, soup):
-        # convert local hyperlinks
-        for a in soup.findAll('a', href=True):
-            if a['href'].startswith('/'):
-                a['href'] = 'http://www.dw.com' + a['href']
-            elif a['href'].startswith('#'):
-                del a['href']
-        # remove all style attributes with an effect on font size
-        for item in soup.findAll(attrs={'style': re.compile('font-size')}):
-            del item['style']
+        '''Give every <img> a plain src chosen from its srcset candidates.'''
+        for img in soup.findAll('img', srcset=True):
+            # Assumes srcset alternates URL and descriptor tokens, so every
+            # second token is a URL. Prefer the fourth candidate; fall back
+            # to the last one listed instead of raising IndexError.
+            urls = img['srcset'].split()[::2]
+            if urls:
+                img['src'] = urls[min(3, len(urls) - 1)]
         return soup