__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
dw-world.de
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class DeutscheWelle_hr(BasicNewsRecipe):
    """Recipe for the Croatian-language ('hr') Deutsche Welle RSS feeds.

    Articles are downloaded through the site's print-content popup (see
    ``print_version``) and link markup is flattened in ``preprocess_html``.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vesti iz Njemacke i svijeta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    language = 'hr'
    publication_type = 'newsportal'
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'

    # --- Fetch settings -------------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    # Stylesheet text handed to the recipe framework. It is assembled from
    # adjacent literals so the resulting string is exactly the original
    # one, including its leading and trailing space.
    extra_css = (
        ' @font-face {font-family: "sans1";'
        'src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}'
        ' body{font-family: Arial,sans1,sans-serif}'
        ' img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}'
        ' .caption{font-size: x-small; display: block; margin-bottom: 0.4em} '
    )

    # Replace U+0110 (D with stroke) by U+00D0 (Eth) in the page source.
    # NOTE(review): presumably a workaround for a glyph missing from the
    # target font -- confirm before removing.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    # Reuse the class-level metadata for the conversion step.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Page clean-up rules --------------------------------------------
    remove_tags = [
        dict(name=['iframe', 'embed', 'object', 'form', 'base', 'meta', 'link']),
        dict(attrs={'class': 'actionFooter'}),
    ]
    keep_only_tags = [dict(attrs={'class': 'ArticleDetail detail'})]
    remove_attributes = ['height', 'width', 'onclick', 'border', 'lang']

    # (section title, feed URL) pairs.
    feeds = [
        (u'Svijet', u'http://rss.dw-world.de/rdf/rss-cro-svijet'),
        (u'Europa', u'http://rss.dw-world.de/rdf/rss-cro-eu'),
        (u'Njemacka', u'http://rss.dw-world.de/rdf/rss-cro-ger'),
        (u'Vijesti', u'http://rss.dw-world.de/rdf/rss-cro-all'),
    ]

    def print_version(self, url):
        """Return the print-popup URL for the article at *url*.

        The text after the last ``/`` of *url* is appended to the
        print-content popup base address. If *url* contains no ``/`` the
        whole string is appended.
        """
        article_id = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + article_id

    def preprocess_html(self, soup):
        """Flatten every ``<a>`` element in *soup* and return *soup*.

        * An anchor with a single string child is replaced by that string.
        * An anchor containing an ``<img>`` is renamed to ``<div>`` and
          loses its ``href`` and ``target`` attributes, keeping the image.
        * Any other anchor is replaced by its text as produced by
          ``self.tag_to_string``.
        """
        for anchor in soup.findAll('a'):
            image = anchor.find('img')
            if anchor.string is not None:
                anchor.replaceWith(anchor.string)
            elif image:
                anchor.name = 'div'
                del anchor['href']
                # 'target' is assigned before being deleted, exactly as in
                # the original code.
                # NOTE(review): presumably this guarantees the attribute
                # exists so the delete cannot fail -- confirm against the
                # bundled BeautifulSoup version.
                anchor['target'] = ''
                del anchor['target']
            else:
                anchor.replaceWith(self.tag_to_string(anchor))
        return soup