From 53204ca19d03ecc603cd108396ec552091e2139e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 11 Nov 2010 13:03:29 -0700 Subject: [PATCH] Fix #7500 (Update for recipe Politika) --- resources/recipes/politika.recipe | 75 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/resources/recipes/politika.recipe b/resources/recipes/politika.recipe index 51c2738862..0cdd971027 100644 --- a/resources/recipes/politika.recipe +++ b/resources/recipes/politika.recipe @@ -1,13 +1,10 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' politika.rs ''' import re from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class Politika(BasicNewsRecipe): title = 'Politika Online' @@ -19,53 +16,51 @@ class Politika(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - remove_javascript = True encoding = 'utf8' - language = 'sr' - - lang = 'sr-Latn-RS' - direction = 'ltr' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' + delay = 1 + language = 'sr' + publication_type = 'newspaper' + masthead_url = 'http://static.politika.co.rs/images_new/politika.gif' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Arial,Helvetica,sans1,sans-serif} + h1{font-family: "Times New Roman",Times,serif1,serif} + .articledescription{font-family: sans1, sans-serif} + """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : lang - , 'pretty_print' : True + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [dict(name='div', attrs={'class':'content_center_border'})] - - remove_tags = [ - dict(name='div', attrs={'class':['send_print','txt-komentar']}) - ,dict(name=['object','link','a']) - ,dict(name='h1', attrs={'class':'box_header-tags'}) - ] - - + keep_only_tags = [dict(name='div', attrs={'class':'big_article_home item_details'})] + remove_tags_after = dict(attrs={'class':'online_date'}) + remove_tags = [dict(name=['link','meta','iframe','embed','object'])] + feeds = [ - (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' ) - ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' ) - ,(u'Redakcijski komentari', u'http://www.politika.rs/rubrike/redakcijski-komentari/index.1.lt.xml') - ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' ) - ,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' ) - ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' ) - ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' ) - ,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' ) + (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' ) + ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' ) + ,(u'Ostali komentari' , u'http://www.politika.rs/rubrike/ostali-komentari/index.1.lt.xml' ) + ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' ) + ,(u'Pogledi sa strane', u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml') + ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' ) + ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' ) + ,(u'Spektar' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' ) ] def preprocess_html(self, soup): - soup.html['lang'] = self.lang - soup.html['dir' ] = self.direction - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - soup.head.insert(0,mlang) for item in soup.findAll(style=True): del item['style'] - ftag = soup.find('div',attrs={'class':'content_center_border'}) - if ftag.has_key('align'): - del ftag['align'] - return self.adeify_images(soup) + for item in soup.findAll('a', attrs={'class':'category'}): + item.name='span' + if item.has_key('href'): + del item['href'] + if item.has_key('title'): + del item['title'] + return soup