__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic '
'''
www.standard.rs
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class NoviStandard(BasicNewsRecipe):
    """Recipe that downloads articles from Novi Standard (www.standard.rs).

    Articles are taken from the site's RSS feeds listed in ``feeds``.
    A login is optional: when both a username and a password are set,
    ``get_browser`` submits them through the site's ``login`` form.
    """

    title = 'Novi Standard'
    __author__ = 'Darko Miletic'
    description = 'NoviStandard - energija je neunistiva!'
    publisher = 'Novi Standard'
    category = 'news, politics, Serbia'
    no_stylesheets = True
    delay = 1
    oldest_article = 15
    encoding = 'utf-8'
    publication_type = 'magazine'
    needs_subscription = 'optional'
    remove_empty_feeds = True
    INDEX = 'http://www.standard.rs/'
    use_embedded_content = False
    language = 'sr'
    masthead_url = 'http://www.standard.rs/templates/ja_opal/images/red/logo.png'

    # Adjacent literals are concatenated by the parser; the pieces below join
    # into exactly the same stylesheet text as before (single spaces between
    # rules, one leading and one trailing space).
    extra_css = (
        ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}'
        ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}'
        ' body{font-family: Arial,"Segoe UI","Trebuchet MS",Helvetica,sans1,sans-serif}'
        ' .dropcap{font-family: Georgia,Times,serif1,serif; display:inline}'
        ' .dropcap:first-letter{display: inline; font-size: xx-large; font-weight: bold}'
        ' .contentheading{color: gray; font-size: x-large}'
        ' .article-meta, .createdby{color: red}'
        ' img{margin-top:0.5em; margin-bottom: 0.7em; display: block}'
        ' '
    )

    # Metadata handed to the conversion step, built from the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Replace U+0110 (capital D with stroke) by U+00D0 (capital Eth).
    # NOTE(review): presumably a workaround for fonts lacking U+0110 - confirm.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    # Only the heading, meta line and body of an article are kept.
    keep_only_tags = [
        dict(attrs={'class': ['contentheading', 'article-meta', 'article-content']}),
    ]
    # The voting widget marks the cut-off point and is itself removed below.
    remove_tags_after = dict(attrs={'class': 'extravote-container'})
    remove_tags = [
        dict(name=['object', 'link', 'iframe', 'meta', 'base']),
        dict(attrs={'class': 'extravote-container'}),
    ]
    remove_attributes = [
        'border', 'background', 'height', 'width', 'align', 'valign', 'lang',
    ]

    feeds = [
        (u'Naslovna', u'http://www.standard.rs/index.php?format=feed&type=rss'),
        (u'Politika', u'http://www.standard.rs/vesti/36-politika.html?format=feed&type=rss'),
        (u'Cvijanovic preporucuje', u'http://www.standard.rs/-cvijanovi-vam-preporuuje.html?format=feed&type=rss'),
        (u'Kolumne', u'http://www.standard.rs/vesti/49-kolumne.html?format=feed&type=rss'),
        (u'Kultura', u'http://www.standard.rs/vesti/40-kultura.html?format=feed&type=rss'),
        (u'Lifestyle', u'http://www.standard.rs/vesti/39-lifestyle.html?format=feed&type=rss'),
        (u'Svet', u'http://www.standard.rs/vesti/41-svet.html?format=feed&type=rss'),
        (u'Ekonomija', u'http://www.standard.rs/vesti/37-ekonomija.html?format=feed&type=rss'),
        (u'Sport', u'http://www.standard.rs/vesti/38-sport.html?format=feed&type=rss'),
    ]

    def get_browser(self):
        """Return the download browser, logged in when credentials are set.

        The index page is always opened first. If both ``self.username`` and
        ``self.password`` are not ``None``, the form named ``login`` on that
        page is filled in and submitted.
        """
        br = BasicNewsRecipe.get_browser(self)
        # The login form is selected from the page loaded here.
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.select_form(name='login')
            br['username'] = self.username
            br['passwd'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        """Clean up a parsed article and return the same ``soup`` object.

        Steps, in order:

        1. drop every inline ``style`` attribute;
        2. remove ``div`` elements that have no children;
        3. unwrap links: a link is replaced by its text, except that a link
           wrapping an image (and having no single string) is turned into
           an attribute-less ``div`` so the image is kept;
        4. give every ``img`` without an ``alt`` attribute ``alt="image"``.
        """
        for styled in soup.findAll(style=True):
            del styled['style']

        for div in soup.findAll('div'):
            if not div.contents:
                div.extract()

        for anchor in soup.findAll('a'):
            if anchor.string is not None:
                anchor.replaceWith(anchor.string)
            elif anchor.find('img'):
                anchor.name = 'div'
                # Attributes must be a mapping for bs4 to serialize the tag;
                # an empty dict also iterates as "no attributes" for code
                # that expects a sequence of (name, value) pairs.
                anchor.attrs = {}
            else:
                anchor.replaceWith(self.tag_to_string(anchor))

        for img in soup.findAll('img'):
            # .get() replaces the deprecated has_key() lookup.
            if img.get('alt') is None:
                img['alt'] = 'image'
        return soup