diff --git a/recipes/deutsche_welle_bs.recipe b/recipes/deutsche_welle_bs.recipe index 0b852c94c8..65ccd31cb9 100644 --- a/recipes/deutsche_welle_bs.recipe +++ b/recipes/deutsche_welle_bs.recipe @@ -1,73 +1,44 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_bs(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vijesti iz Njemacke i svijeta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True language = 'bs' publication_type = 'newsportal' remove_empty_feeds = True + remove_javascript = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - + + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + keep_only_tags = [ + dict(name='article') + ] + remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - + feeds = [ - - (u'Politika', u'http://rss.dw-world.de/rdf/rss-bos-pol'), - (u'Evropa', u'http://rss.dw-world.de/rdf/rss-bos-eu'), - (u'Kiosk', u'http://rss.dw-world.de/rdf/rss-bos-eu'), - (u'Ekonomija i Nuka', u'http://rss.dw-world.de/rdf/rss-bos-eco'), - (u'Kultura', u'http://rss.dw-world.de/rdf/rss-bos-cul'), - (u'Sport', u'http://rss.dw-world.de/rdf/rss-bos-sp') + (u'Politika', u'http://rss.dw-world.de/rdf/rss-bos-pol'), + (u'Evropa', u'http://rss.dw-world.de/rdf/rss-bos-eu'), + (u'Kiosk', u'http://rss.dw-world.de/rdf/rss-bos-eu'), + (u'Ekonomija i Nuka', u'http://rss.dw-world.de/rdf/rss-bos-eco'), + (u'Kultura', u'http://rss.dw-world.de/rdf/rss-bos-cul'), + (u'Sport', u'http://rss.dw-world.de/rdf/rss-bos-sp') ] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_es.recipe b/recipes/deutsche_welle_es.recipe index 1300fea96d..a036b2a96b 100644 --- a/recipes/deutsche_welle_es.recipe +++ b/recipes/deutsche_welle_es.recipe @@ -1,21 +1,8 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -from __future__ import unicode_literals, division, absolute_import, print_function - -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' - -''' -Deutsche Welle (espaƱol) - dw.com/es -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_es(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Noticias desde Alemania y mundo' publisher = 'Deutsche Welle' language = 'es' @@ -27,6 +14,18 @@ class DeutscheWelle_es(BasicNewsRecipe): remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] + feeds = [ ('Titulares', 'http://rss.dw-world.de/rdf/rss-sp-top'), ('Noticias de Alemania', 'http://rss.dw-world.de/rdf/rss-sp-ale'), @@ -40,37 +39,8 @@ class DeutscheWelle_es(BasicNewsRecipe): ('Conozca Alemania', 'http://rss.dw-world.de/rdf/rss-sp-con') ] - keep_only_tags = [ - dict(name='div', attrs={'class': 'col3'}) - ] - - remove_tags_after = [ - dict(name='div', attrs={'class': 'group'}) - ] - - remove_tags = [ - dict(name='div', attrs={'class': 'col1'}), - dict(name='div', attrs={'class': re.compile('gallery')}), - dict(name='div', attrs={'class': re.compile('audio')}), - dict(name='div', attrs={'class': re.compile('video')}) - ] - - remove_attributes = ['height', 'width', - 'onclick', 'border', 'lang', 'link'] - - extra_css = ''' - h1 {font-size: 1.6em; margin-top: 0em} - .artikel {font-size: 1em; text-transform: uppercase; margin: 0em} - ''' - + def preprocess_html(self, soup): - # convert local hyperlinks - for a in soup.findAll('a', href=True): - if a['href'].startswith('/'): - a['href'] = 'http://www.dw.com' + a['href'] - elif a['href'].startswith('#'): - del a['href'] - # remove all style attributes with an effect on font size - for item in soup.findAll(attrs={'style': re.compile('font-size')}): - del item['style'] + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_hr.recipe b/recipes/deutsche_welle_hr.recipe index 906e4a1d39..73264fc635 100644 --- a/recipes/deutsche_welle_hr.recipe +++ b/recipes/deutsche_welle_hr.recipe @@ -1,20 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_hr(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vesti iz Njemacke i svijeta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -22,50 +14,29 @@ class DeutscheWelle_hr(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } + remove_javascript = True + + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + keep_only_tags = [ + dict(name='article') + ] + remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - + feeds = [ - - (u'Svijet', u'http://rss.dw-world.de/rdf/rss-cro-svijet'), - (u'Europa', u'http://rss.dw-world.de/rdf/rss-cro-eu'), - (u'Njemacka', u'http://rss.dw-world.de/rdf/rss-cro-ger'), - (u'Vijesti', u'http://rss.dw-world.de/rdf/rss-cro-all') + (u'Svijet', u'http://rss.dw-world.de/rdf/rss-cro-svijet'), + (u'Europa', u'http://rss.dw-world.de/rdf/rss-cro-eu'), + (u'Njemacka', u'http://rss.dw-world.de/rdf/rss-cro-ger'), + (u'Vijesti', u'http://rss.dw-world.de/rdf/rss-cro-all') ] - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup diff --git a/recipes/deutsche_welle_pt.recipe b/recipes/deutsche_welle_pt.recipe index 4b9a9ea9dc..aff42efd0a 100644 --- a/recipes/deutsche_welle_pt.recipe +++ b/recipes/deutsche_welle_pt.recipe @@ -1,19 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_pt(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Noticias desde Alemania y mundo' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -21,42 +14,25 @@ class DeutscheWelle_pt(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - body{font-family: Arial,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ + + + remove_javascript = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] + return soup - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-br-all')] - - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - - def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) - return soup diff --git a/recipes/deutsche_welle_ru.recipe b/recipes/deutsche_welle_ru.recipe index 0fbbcc6327..ec4e838af0 100644 --- a/recipes/deutsche_welle_ru.recipe +++ b/recipes/deutsche_welle_ru.recipe @@ -1,24 +1,36 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 - -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle(BasicNewsRecipe): title = u'Deutsche Welle \u043D\u0430 \u0440\u0443\u0441\u0441\u043A\u043E\u043C' description = u'\u0420\u0443\u0441\u0441\u043A\u0430\u044F \u0440\u0435\u0434\u0430\u043A\u0446\u0438\u044F Deutsche Welle: \u043D\u043E\u0432\u043E\u0441\u0442\u0438, \u0430\u043D\u0430\u043B\u0438\u0442\u0438\u043A\u0430, \u043A\u043E\u043C\u043C\u0435\u043D\u0442\u0430\u0440\u0438\u0438 \u0438 \u0440\u0435\u043F\u043E\u0440\u0442\u0430\u0436\u0438 \u0438\u0437 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0415\u0432\u0440\u043E\u043F\u044B, \u043D\u0435\u043C\u0435\u0446\u043A\u0438\u0439 \u0438 \u0435\u0432\u0440\u043E\u043F\u0435\u0439\u0441\u043A\u0438\u0439 \u0432\u0437\u0433\u043B\u044F\u0434 \u043D\u0430 \u0441\u043E\u0431\u044B\u0442\u0438\u044F \u0432 \u0420\u043E\u0441\u0441\u0438\u0438 \u0438 \u043C\u0438\u0440\u0435, \u043F\u0440\u0430\u043A\u0442\u0438\u0447\u0435\u0441\u043A\u0438\u0435 \u0441\u043E\u0432\u0435\u0442\u044B \u0434\u043B\u044F \u0442\u0443\u0440\u0438\u0441\u0442\u043E\u0432 \u0438 \u0442\u0435\u0445, \u043A\u0442\u043E \u0436\u0435\u043B\u0430\u0435\u0442 \u0443\u0447\u0438\u0442\u044C\u0441\u044F \u0438\u043B\u0438 \u0440\u0430\u0431\u043E\u0442\u0430\u0442\u044C \u0432 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0434\u0440\u0443\u0433\u0438\u0445 \u0441\u0442\u0440\u0430\u043D\u0430\u0445 \u0415\u0432\u0440\u043E\u0441\u043E\u044E\u0437\u0430.' # noqa - __author__ = 'bugmen00t' + __author__ = 'bugmen00t, unkn0wn' publication_type = 'newspaper' - oldest_article = 14 + oldest_article = 2 max_articles_per_feed = 100 language = 'ru' - cover_url = 'https://www.dw.com/cssi/dwlogo-print.gif' - auto_cleanup = False - no_stylesheets = False + # cover_url = 'https://www.dw.com/cssi/dwlogo-print.gif' - remove_tags_before = dict(name='h1') + remove_javascript = True + no_stylesheets = True + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] + return soup - remove_tags_after = dict(name='div', attrs={'class': 'longText'}) + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] feeds = [ ( diff --git a/recipes/deutsche_welle_sr.recipe b/recipes/deutsche_welle_sr.recipe index b9c67e4976..7f1b1717c0 100644 --- a/recipes/deutsche_welle_sr.recipe +++ b/recipes/deutsche_welle_sr.recipe @@ -1,20 +1,12 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' -''' -dw-world.de -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class DeutscheWelle_sr(BasicNewsRecipe): title = 'Deutsche Welle' - __author__ = 'Darko Miletic' + __author__ = 'unkn0wn' description = 'Vesti iz Nemacke i sveta' publisher = 'Deutsche Welle' category = 'news, politics, Germany' - oldest_article = 1 + oldest_article = 2 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True @@ -22,55 +14,34 @@ class DeutscheWelle_sr(BasicNewsRecipe): publication_type = 'newsportal' remove_empty_feeds = True masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif' - extra_css = """ - @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,sans1,sans-serif} - img{margin-top: 0.5em; margin-bottom: 0.2em; display: block} - .caption{font-size: x-small; display: block; margin-bottom: 0.4em} - """ - preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } - - remove_tags = [ - dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']), dict( - attrs={'class': 'actionFooter'}) - ] - keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})] - remove_attributes = ['height', 'width', 'onclick', 'border', 'lang'] - - feeds = [ - - (u'Politika', u'http://rss.dw-world.de/rdf/rss-ser-pol'), - (u'Srbija', u'http://rss.dw-world.de/rdf/rss-ser-pol-ser'), - (u'Region', u'http://rss.dw-world.de/rdf/rss-ser-pol-region'), - (u'Evropa', u'http://rss.dw-world.de/rdf/rss-ser-pol-eu'), - (u'Nemacka', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Svet', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Pregled stampe', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), - (u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science'), - (u'Kultura', u'feed:http://rss.dw-world.de/rdf/rss-ser-cul') - ] - - def print_version(self, url): - artl = url.rpartition('/')[2] - return 'http://www.dw-world.de/popups/popup_printcontent/' + artl - + remove_javascript = True + ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['height', 'width', 'style'] + def preprocess_html(self, soup): - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - del item['href'] - item['target'] = '' - del item['target'] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[6] return soup + + keep_only_tags = [ + dict(name='article') + ] + + remove_tags = [ + dict(name=['footer', 'source']), + dict(attrs={'data-tracking-name':'sharing-icons-inline'}), + classes('kicker advertisement vjs-wrapper') + ] + + feeds = [ + (u'Politika', u'http://rss.dw-world.de/rdf/rss-ser-pol'), + (u'Srbija', u'http://rss.dw-world.de/rdf/rss-ser-pol-ser'), + (u'Region', u'http://rss.dw-world.de/rdf/rss-ser-pol-region'), + (u'Evropa', u'http://rss.dw-world.de/rdf/rss-ser-pol-eu'), + (u'Nemacka', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Svet', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Pregled stampe', u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'), + (u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science'), + (u'Kultura', u'feed:http://rss.dw-world.de/rdf/rss-ser-cul') + ] +