diff --git a/recipes/deutsche_welle_bs.recipe b/recipes/deutsche_welle_bs.recipe index 0b852c94c8..65ccd31cb9 100644 --- a/recipes/deutsche_welle_bs.recipe +++ b/recipes/deutsche_welle_bs.recipe @@ -1,73 +1,44 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_bs(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vijesti iz Njemacke i svijeta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True language = 'bs' publication_type = 'newsportal' remove_empty_feeds = True + remove_javascript = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - + + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + keep_only_tags = [ + dict(name='article') + ] + remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - + feeds = [ - - (u'Politika', u'http://rss.dw-world.de/rdf/rss-bos-pol'), - (u'Evropa', u'http://rss.dw-world.de/rdf/rss-bos-eu'), - (u'Kiosk', u'http://rss.dw-world.de/rdf/rss-bos-eu'), - (u'Ekonomija i Nuka', u'http://rss.dw-world.de/rdf/rss-bos-eco'), - (u'Kultura', u'http://rss.dw-world.de/rdf/rss-bos-cul'), - (u'Sport', u'http://rss.dw-world.de/rdf/rss-bos-sp') + (u'Politika', u'http://rss.dw-world.de/rdf/rss-bos-pol'), + (u'Evropa', u'http://rss.dw-world.de/rdf/rss-bos-eu'), + (u'Kiosk', u'http://rss.dw-world.de/rdf/rss-bos-eu'), + (u'Ekonomija i Nuka', u'http://rss.dw-world.de/rdf/rss-bos-eco'), + (u'Kultura', u'http://rss.dw-world.de/rdf/rss-bos-cul'), + (u'Sport', u'http://rss.dw-world.de/rdf/rss-bos-sp') ] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_de.recipe b/recipes/deutsche_welle_de.recipe index 2d988f7c86..4ee5bd9c67 100644 --- a/recipes/deutsche_welle_de.recipe +++ b/recipes/deutsche_welle_de.recipe @@ -1,21 +1,12 @@ -from calibre.web.feeds.news import BasicNewsRecipe -# History: -# 1: Base Version -# 2: Added rules for wdr.de, ndr.de, br-online.de -# 3: Added rules for rbb-online.de, boerse.ard.de, sportschau.de -# 4: New design of tagesschau.de implemented. Simplified. -# 5: Taken out the pictures. - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle(BasicNewsRecipe): title = 'Deutsche Welle' description = 'Nachrichten der Deutschen Welle (DW)' publisher = 'DW - info@dw.com' language = 'de' - version = 1 - cover_url = 'https://pbs.twimg.com/profile_images/900269457976823808/nkod9w_m_400x400.jpg' - __author__ = 'VoHe' - oldest_article = 3 + __author__ = 'unkn0wn' + oldest_article = 2 max_articles_per_feed = 200 no_stylesheets = True remove_javascript = True @@ -23,26 +14,32 @@ class DeutscheWelle(BasicNewsRecipe): remove_javascript = True remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] - remove_tags_before = dict(name='h4', attrs={'class':'artikel'}) - - remove_tags_after = dict(name='div', attrs={'class':'col1 dim'}) + keep_only_tags = [ + dict(name='article') + ] remove_tags = [ - dict(name='div', attrs={'class':'footerSection'}), - dict(name='div', attrs={'class':'sharing-bar'}), - dict(name='div', attrs={'class':'coll dim'}), - dict(name='div', attrs={'class':'languageSection'}), + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] + # watch out https://www.dw.com/de/service/rss/s-9773 for description of possible rss feeds feeds = [ - ('Thema des Tages', 'http://rss.dw.com/xml/rss-de-top'), - # ('Nachrichten', 'http://rss.dw.com/xml/rss-de-news'), + ('Nachrichten', 'http://rss.dw.com/xml/rss-de-news'), ('Wissenschaft', 'http://rss.dw.com/xml/rss-de-wissenschaft'), - # ('Sport', 'http://rss.dw.com/xml/rss-de-sport'), + ('Sport', 'http://rss.dw.com/xml/rss-de-sport'), ('Deuschland entdecken', 'http://rss.dw.com/xml/rss-de-deutschlandentdecken'), ('Presse', 'http://rss.dw.com/xml/presse'), ('Politik', 'http://rss.dw.com/xml/rss_de_politik'), ('Wirtschaft', 'http://rss.dw.com/xml/rss-de-eco'), ('Kultur und Leben', 'http://rss.dw.com/xml/rss-de-cul'), + ('Thema des Tages', 'http://rss.dw.com/xml/rss-de-top'), ] + + def preprocess_html(self, soup): + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] + return soup diff --git a/recipes/deutsche_welle_en.recipe b/recipes/deutsche_welle_en.recipe index 3cde7e7418..faa02a6183 100644 --- a/recipes/deutsche_welle_en.recipe +++ b/recipes/deutsche_welle_en.recipe @@ -1,34 +1,31 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -from __future__ import unicode_literals, division, absolute_import, print_function - -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' - -''' -Deutsche Welle (english) - dw.com/en -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_en(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'News from Germany and the world' publisher = 'Deutsche Welle' language = 'en' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 50 no_stylesheets = True remove_javascript = True remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} - + remove_attributes = ['height', 'width', 'style'] + + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] + feeds = [ - ('Top Stories', 'http://rss.dw-world.de/rdf/rss-en-top'), ('World', 'http://rss.dw.de/rdf/rss-en-world'), ('Germany', 'http://rss.dw.de/rdf/rss-en-ger'), ('Europe', 'http://rss.dw.de/rdf/rss-en-eu'), @@ -36,40 +33,11 @@ class DeutscheWelle_en(BasicNewsRecipe): ('Culture & Lifestyle', 'http://rss.dw.de/rdf/rss-en-cul'), ('Sports', 'http://rss.dw.de/rdf/rss-en-sports'), ('Visit Germany', 'http://rss.dw.de/rdf/rss-en-visitgermany'), - ('Asia', 'http://rss.dw.de/rdf/rss-en-asia') + ('Asia', 'http://rss.dw.de/rdf/rss-en-asia'), + ('Top Stories', 'http://rss.dw-world.de/rdf/rss-en-top'), ] - - keep_only_tags = [ - dict(name='div', attrs={'class': 'col3'}) - ] - - remove_tags_after = [ - dict(name='div', attrs={'class': 'group'}) - ] - - remove_tags = [ - dict(name='div', attrs={'class': 'col1'}), - dict(name='div', attrs={'class': re.compile('gallery')}), - dict(name='div', attrs={'class': re.compile('audio')}), - dict(name='div', attrs={'class': re.compile('video')}) - ] - - remove_attributes = ['height', 'width', - 'onclick', 'border', 'lang', 'link'] - - extra_css = ''' - h1 {font-size: 1.6em; margin-top: 0em} - .artikel {font-size: 1em; text-transform: uppercase; margin: 0em} - ''' - + def preprocess_html(self, soup): - # convert local hyperlinks - for a in soup.findAll('a', href=True): - if a['href'].startswith('/'): - a['href'] = 'http://www.dw.com' + a['href'] - elif a['href'].startswith('#'): - del a['href'] - # remove all style attributes with an effect on font size - for item in soup.findAll(attrs={'style': re.compile('font-size')}): - del item['style'] + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_es.recipe b/recipes/deutsche_welle_es.recipe index 1300fea96d..a036b2a96b 100644 --- a/recipes/deutsche_welle_es.recipe +++ b/recipes/deutsche_welle_es.recipe @@ -1,21 +1,8 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -from __future__ import unicode_literals, division, absolute_import, print_function - -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' - -''' -Deutsche Welle (espaƱol) - dw.com/es -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_es(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Noticias desde Alemania y mundo' publisher = 'Deutsche Welle' language = 'es' @@ -27,6 +14,18 @@ class DeutscheWelle_es(BasicNewsRecipe): remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] + feeds = [ ('Titulares', 'http://rss.dw-world.de/rdf/rss-sp-top'), ('Noticias de Alemania', 'http://rss.dw-world.de/rdf/rss-sp-ale'), @@ -40,37 +39,8 @@ class DeutscheWelle_es(BasicNewsRecipe): ('Conozca Alemania', 'http://rss.dw-world.de/rdf/rss-sp-con') ] - keep_only_tags = [ - dict(name='div', attrs={'class': 'col3'}) - ] - - remove_tags_after = [ - dict(name='div', attrs={'class': 'group'}) - ] - - remove_tags = [ - dict(name='div', attrs={'class': 'col1'}), - dict(name='div', attrs={'class': re.compile('gallery')}), - dict(name='div', attrs={'class': re.compile('audio')}), - dict(name='div', attrs={'class': re.compile('video')}) - ] - - remove_attributes = ['height', 'width', - 'onclick', 'border', 'lang', 'link'] - - extra_css = ''' - h1 {font-size: 1.6em; margin-top: 0em} - .artikel {font-size: 1em; text-transform: uppercase; margin: 0em} - ''' - + def preprocess_html(self, soup): - # convert local hyperlinks - for a in soup.findAll('a', href=True): - if a['href'].startswith('/'): - a['href'] = 'http://www.dw.com' + a['href'] - elif a['href'].startswith('#'): - del a['href'] - # remove all style attributes with an effect on font size - for item in soup.findAll(attrs={'style': re.compile('font-size')}): - del item['style'] + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_hr.recipe b/recipes/deutsche_welle_hr.recipe index 906e4a1d39..73264fc635 100644 --- a/recipes/deutsche_welle_hr.recipe +++ b/recipes/deutsche_welle_hr.recipe @@ -1,20 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_hr(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vesti iz Njemacke i svijeta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -22,50 +14,29 @@ class DeutscheWelle_hr(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } + remove_javascript = True + + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + keep_only_tags = [ + dict(name='article') + ] + remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - + feeds = [ - - (u'Svijet', u'http://rss.dw-world.de/rdf/rss-cro-svijet'), - (u'Europa', u'http://rss.dw-world.de/rdf/rss-cro-eu'), - (u'Njemacka', u'http://rss.dw-world.de/rdf/rss-cro-ger'), - (u'Vijesti', u'http://rss.dw-world.de/rdf/rss-cro-all') + (u'Svijet', u'http://rss.dw-world.de/rdf/rss-cro-svijet'), + (u'Europa', u'http://rss.dw-world.de/rdf/rss-cro-eu'), + (u'Njemacka', u'http://rss.dw-world.de/rdf/rss-cro-ger'), + (u'Vijesti', u'http://rss.dw-world.de/rdf/rss-cro-all') ] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_pt.recipe b/recipes/deutsche_welle_pt.recipe index 4b9a9ea9dc..aff42efd0a 100644 --- a/recipes/deutsche_welle_pt.recipe +++ b/recipes/deutsche_welle_pt.recipe @@ -1,19 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_pt(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Noticias desde Alemania y mundo' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -21,42 +14,25 @@ class DeutscheWelle_pt(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - body{font-family: Arial,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ + + + remove_javascript = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] + return soup - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-br-all')] - - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) - return soup diff --git a/recipes/deutsche_welle_ru.recipe b/recipes/deutsche_welle_ru.recipe index 0fbbcc6327..ec4e838af0 100644 --- a/recipes/deutsche_welle_ru.recipe +++ b/recipes/deutsche_welle_ru.recipe @@ -1,24 +1,36 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 - -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle(BasicNewsRecipe): title = u'Deutsche Welle \u043D\u0430 \u0440\u0443\u0441\u0441\u043A\u043E\u043C' description = u'\u0420\u0443\u0441\u0441\u043A\u0430\u044F \u0440\u0435\u0434\u0430\u043A\u0446\u0438\u044F Deutsche Welle: \u043D\u043E\u0432\u043E\u0441\u0442\u0438, \u0430\u043D\u0430\u043B\u0438\u0442\u0438\u043A\u0430, \u043A\u043E\u043C\u043C\u0435\u043D\u0442\u0430\u0440\u0438\u0438 \u0438 \u0440\u0435\u043F\u043E\u0440\u0442\u0430\u0436\u0438 \u0438\u0437 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0415\u0432\u0440\u043E\u043F\u044B, \u043D\u0435\u043C\u0435\u0446\u043A\u0438\u0439 \u0438 \u0435\u0432\u0440\u043E\u043F\u0435\u0439\u0441\u043A\u0438\u0439 \u0432\u0437\u0433\u043B\u044F\u0434 \u043D\u0430 \u0441\u043E\u0431\u044B\u0442\u0438\u044F \u0432 \u0420\u043E\u0441\u0441\u0438\u0438 \u0438 \u043C\u0438\u0440\u0435, \u043F\u0440\u0430\u043A\u0442\u0438\u0447\u0435\u0441\u043A\u0438\u0435 \u0441\u043E\u0432\u0435\u0442\u044B \u0434\u043B\u044F \u0442\u0443\u0440\u0438\u0441\u0442\u043E\u0432 \u0438 \u0442\u0435\u0445, \u043A\u0442\u043E \u0436\u0435\u043B\u0430\u0435\u0442 \u0443\u0447\u0438\u0442\u044C\u0441\u044F \u0438\u043B\u0438 \u0440\u0430\u0431\u043E\u0442\u0430\u0442\u044C \u0432 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0434\u0440\u0443\u0433\u0438\u0445 \u0441\u0442\u0440\u0430\u043D\u0430\u0445 \u0415\u0432\u0440\u043E\u0441\u043E\u044E\u0437\u0430.' # noqa - __author__ = 'bugmen00t' + __author__ = 'bugmen00t, unkn0wn' publication_type = 'newspaper' - oldest_article = 14 + oldest_article = 2 max_articles_per_feed = 100 language = 'ru' - cover_url = 'https://www.dw.com/cssi/dwlogo-print.gif' - auto_cleanup = False - no_stylesheets = False + # cover_url = 'https://www.dw.com/cssi/dwlogo-print.gif' - remove_tags_before = dict(name='h1') + remove_javascript = True + no_stylesheets = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] + return soup - remove_tags_after = dict(name='div', attrs={'class': 'longText'}) + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] feeds = [ ( diff --git a/recipes/deutsche_welle_sr.recipe b/recipes/deutsche_welle_sr.recipe index b9c67e4976..7f1b1717c0 100644 --- a/recipes/deutsche_welle_sr.recipe +++ b/recipes/deutsche_welle_sr.recipe @@ -1,20 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_sr(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vesti iz Nemacke i sveta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -22,55 +14,34 @@ class DeutscheWelle_sr(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) - ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - - feeds = [ - - (u'Politika', u'http://rss.dw-world.de/rdf/rss-ser-pol'), - (u'Srbija', u'http://rss.dw-world.de/rdf/rss-ser-pol-ser'), - (u'Region', u'http://rss.dw-world.de/rdf/rss-ser-pol-region'), - (u'Evropa', u'http://rss.dw-world.de/rdf/rss-ser-pol-eu'), - (u'Nemacka', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Svet', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Pregled stampe', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science'), - (u'Kultura', u'feed:http://rss.dw-world.de/rdf/rss-ser-cul') - ] - - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - + remove_javascript = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup + + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] + + feeds = [ + (u'Politika', u'http://rss.dw-world.de/rdf/rss-ser-pol'), + (u'Srbija', u'http://rss.dw-world.de/rdf/rss-ser-pol-ser'), + (u'Region', u'http://rss.dw-world.de/rdf/rss-ser-pol-region'), + (u'Evropa', u'http://rss.dw-world.de/rdf/rss-ser-pol-eu'), + (u'Nemacka', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Svet', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Pregled stampe', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science'), + (u'Kultura', u'feed:http://rss.dw-world.de/rdf/rss-ser-cul') + ] + diff --git a/recipes/horizons.recipe b/recipes/horizons.recipe index bae979a043..26b48b69b9 100644 --- a/recipes/horizons.recipe +++ b/recipes/horizons.recipe @@ -27,6 +27,9 @@ class horizons(BasicNewsRecipe): classes('back-link'), dict(name='div', attrs={'class':'single-post-footer'}) ] + + def get_browser(self): + return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) def parse_index(self): soup = self.index_to_soup('https://www.cirsd.org/en/horizons') diff --git a/recipes/icons/deutsche_welle_bs.png b/recipes/icons/deutsche_welle_bs.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_bs.png and b/recipes/icons/deutsche_welle_bs.png differ diff --git a/recipes/icons/deutsche_welle_de.png b/recipes/icons/deutsche_welle_de.png index 0cea86328c..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_de.png and b/recipes/icons/deutsche_welle_de.png differ diff --git a/recipes/icons/deutsche_welle_en.png b/recipes/icons/deutsche_welle_en.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_en.png and b/recipes/icons/deutsche_welle_en.png differ diff --git a/recipes/icons/deutsche_welle_es.png b/recipes/icons/deutsche_welle_es.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_es.png and b/recipes/icons/deutsche_welle_es.png differ diff --git a/recipes/icons/deutsche_welle_hr.png b/recipes/icons/deutsche_welle_hr.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_hr.png and b/recipes/icons/deutsche_welle_hr.png differ diff --git a/recipes/icons/deutsche_welle_pt.png b/recipes/icons/deutsche_welle_pt.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_pt.png and b/recipes/icons/deutsche_welle_pt.png differ diff --git a/recipes/icons/deutsche_welle_sr.png b/recipes/icons/deutsche_welle_sr.png index 034074e83d..c0c5b8c0eb 100644 Binary files a/recipes/icons/deutsche_welle_sr.png and b/recipes/icons/deutsche_welle_sr.png differ diff --git a/recipes/icons/horizons.png b/recipes/icons/horizons.png new file mode 100644 index 0000000000..5cf16612ac Binary files /dev/null and b/recipes/icons/horizons.png differ diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe index efbd434694..36cf1da7aa 100644 --- a/recipes/indian_express.recipe +++ b/recipes/indian_express.recipe @@ -99,7 +99,7 @@ class IndianExpress(BasicNewsRecipe): div = soup.find('div', attrs={'class':['nation', 'o-opin']}) for art in div.findAll(attrs={'class':['articles', 'o-opin-article']}): for a in art.findAll('a', href=True): - if not a.find('img') and '/profile/' not in a['href']: + if not a.find('img') and not ('/profile/' in a['href'] or '/agency/' in a['href']): url = a['href'] title = self.tag_to_string(a) desc = '' diff --git a/recipes/psych.recipe b/recipes/psych.recipe index 235489565c..757621f108 100644 --- a/recipes/psych.recipe +++ b/recipes/psych.recipe @@ -43,8 +43,8 @@ class PsychologyToday(BasicNewsRecipe): soup = self.index_to_soup(absurl(a['href'])) articles = [] for article in soup.findAll('div', attrs={'class':'article-text'}): - title = self.tag_to_string(article.find(['h2','h3'])).strip() - url = absurl(article.find(['h2','h3']).a['href']) + title = self.tag_to_string(article.find(attrs={'class':['h2','h3']})).strip() + url = absurl(article.find(attrs={'class':['h2','h3']}).a['href']) self.log('\n', title, 'at', url) desc = self.tag_to_string(article.find('p',**classes('description'))).strip() author = self.tag_to_string(article.find('p',**classes('byline')).a).strip()