__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic '
'''
www.novilist.hr
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class NoviList_Portal_hr(BasicNewsRecipe):
    """Calibre recipe for the Novi List online portal (www.novilist.hr).

    Articles are taken from a single RSS feed, fetched through the site's
    print layout, and reduced to the ``div#content`` element.
    """

    title = 'Novi List - online portal'
    __author__ = 'Darko Miletic'
    description = 'Portal Novog Lista'
    publisher = 'NOVI LIST d.d.'
    category = 'Novi list, politika, hrvatski dnevnik, Novine, Hrvatska, Croatia, News, newspaper, Hrvatski,Primorje, dnevni list, Rijeka'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'hr'
    publication_type = 'newsportal'
    masthead_url = 'http://www.novilist.hr/extension/novilist/design/novilist/images/logo-print.gif'

    # Built from adjacent literals so the resulting stylesheet text is exactly
    # the same string as before, just without one very long source line.
    extra_css = (
        ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}'
        ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}'
        ' body{font-family: Geneva,Arial,Helvetica,Swiss,sans1,sans-serif }'
        ' h1{font-family: Georgia,serif1,serif}'
        ' img{display:block; margin-bottom: 0.4em; margin-top: 0.4em} '
    )

    # Replaces every U+0110 (D with stroke) with U+00D0 (Eth) in the fetched
    # markup. NOTE(review): presumably a workaround for fonts that lack the
    # U+0110 glyph -- confirm before removing.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    # Keep only the main content container of each page.
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

    # Drop head/embedding elements and any div carrying the 'embed-object'
    # class (the class attribute may be missing, hence the `x and ...` guard).
    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'embed', 'object']),
        dict(name='div', attrs={
            'class': lambda x: x and 'embed-object' in x.split()}),
    ]
    remove_attributes = ['border', 'lang']

    feeds = [(u'Vijesti', u'http://www.novilist.hr/rss/feed/sve.xml')]

    def get_article_url(self, article):
        """Return the article URL, or None if the article should be skipped.

        The URL is obtained from the base-class implementation. Articles
        whose URL contains '/Foto/' or '/Informator/' are skipped by
        returning None.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if url is None:
            # No link was found for this entry; a substring test on None
            # would raise TypeError, so skip the article instead.
            return None
        skipped_sections = ('/Foto/', '/Informator/')
        if any(section in url for section in skipped_sections):
            return None
        return url

    def print_version(self, url):
        """Return the print-layout URL corresponding to the article *url*."""
        return url.replace(
            'http://www.novilist.hr/',
            'http://www.novilist.hr/layout/set/print/',
        )