Updated NSPM. Fixes #762130 (Updated recipe for NSPM)

2025-11-26 16:25:02 -05:00 · 2011-04-15 13:58:27 -06:00 · 2011-04-15 13:58:27 -06:00 · ab26bbe47e
commit ab26bbe47e
parent 9e3feef720
1 changed file with 70 additions and 31 deletions
--- a/recipes/nspm.recipe
+++ b/recipes/nspm.recipe
@@ -1,12 +1,12 @@
 __license__   = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 nspm.rs
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 class Nspm(BasicNewsRecipe):
    title                 = 'Nova srpska politicka misao'
@@ -21,7 +21,6 @@ class Nspm(BasicNewsRecipe):
    INDEX                 = 'http://www.nspm.rs/?alphabet=l'
    encoding              = 'utf-8'
    language              = 'sr'
    delay                 = 2
    remove_empty_feeds    = True
    publication_type      = 'magazine'
    masthead_url          = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
@@ -29,27 +28,21 @@ class Nspm(BasicNewsRecipe):
                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                                body{font-family: "Times New Roman", serif1, serif}
                                .article_description{font-family: Arial, sans1, sans-serif}
-                                img{margin-top:0.5em; margin-bottom: 0.7em}
+                                img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
                                .author{color: #990000; font-weight: bold}
                                .author,.createdate{font-size: 0.9em} """
    conversion_options = {
-                          'comment'          : description
+                          'comment'      : description
-                        , 'tags'             : category
+                        , 'tags'         : category
-                        , 'publisher'        : publisher
+                        , 'publisher'    : publisher
-                        , 'language'         : language
+                        , 'language'     : language
-                        , 'linearize_tables' : True
+                        , 'pretty_print' : True
                        }
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
-    keep_only_tags = [dict(attrs={'id':'jsn-mainbody'})]
+    remove_tags        = [dict(name=['link','script','meta','base','img'])]
-    remove_tags        = [
+    remove_attributes  = ['width','height','lang','xmlns:fb','xmlns:og','vspace','hspace','type','start','size']
                           dict(name=['link','object','embed','script','meta','base','iframe'])
                          ,dict(attrs={'class':'buttonheading'})
                         ]
    remove_tags_before = dict(attrs={'class':'contentheading'})
    remove_tags_after  = dict(attrs={'class':'article_separator'})
    remove_attributes  = ['width','height']
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@@ -57,21 +50,67 @@ class Nspm(BasicNewsRecipe):
        return br
    feeds = [
-                 (u'Rubrike'      , u'http://www.nspm.rs/rubrike/feed/rss.html')
+                 (u'Rubrike'                 , u'http://www.nspm.rs/rubrike/feed/rss.html'                 )
-                ,(u'Debate'       , u'http://www.nspm.rs/debate/feed/rss.html')
+                ,(u'Debate'                  , u'http://www.nspm.rs/debate/feed/rss.html'                  )
-                ,(u'Reci i misli' , u'http://www.nspm.rs/reci-i-misli/feed/rss.html')
+                ,(u'Reci i misli'            , u'http://www.nspm.rs/reci-i-misli/feed/rss.html'            )
                ,(u'Samo smeh srbina spasava', u'http://www.nspm.rs/samo-smeh-srbina-spasava/feed/rss.html')
-                ,(u'Polemike'     , u'http://www.nspm.rs/polemike/feed/rss.html')
+                ,(u'Polemike'                , u'http://www.nspm.rs/polemike/feed/rss.html'                )
-                ,(u'Prikazi'   , u'http://www.nspm.rs/prikazi/feed/rss.html')
+                ,(u'Prikazi'                 , u'http://www.nspm.rs/prikazi/feed/rss.html'                 )
-                ,(u'Prenosimo'   , u'http://www.nspm.rs/prenosimo/feed/rss.html')
+                ,(u'Prenosimo'               , u'http://www.nspm.rs/prenosimo/feed/rss.html'               )
-                ,(u'Hronika'   , u'http://www.nspm.rs/tabela/hronika/feed/rss.html')
+                ,(u'Hronika'                 , u'http://www.nspm.rs/tabela/hronika/feed/rss.html'          )
            ]
    def preprocess_html(self, soup):
-        for item in soup.body.findAll(style=True):
+        atitle = soup.body.find('a',attrs={'class':'contentpagetitle'})
-            del item['style']
+        if atitle:
-        for item in soup.body.findAll('h1'):
+           cleanTitle = Tag(soup,'h1',[('class','contentpagetitle')])
-            nh = NavigableString(item.a.string)
+           cnt        = NavigableString(self.tag_to_string(atitle))
-            item.a.extract()
+           cleanTitle.append(cnt)
-            item.insert(0,nh)
+           
-        return self.adeify_images(soup)
+        author = soup.body.find('span',attrs={'class':'author'})
        if author:
           author.extract()
           author.name = 'div'
        crdate = soup.body.find('td',attrs={'class':'createdate'})
        if crdate:
           cleanCrdate = Tag(soup,'div',[('class','createdate')])
           cnt         = NavigableString(self.tag_to_string(crdate))
           cleanCrdate.append(cnt)
           #get the dependant element
           artText = Tag(soup,'div',[('class','text')])
           textHolderp = crdate.parent
           textHolder = textHolderp.nextSibling
           while textHolder and (not isinstance(textHolder,Tag) or (textHolder.name <> textHolderp.name)):
                 textHolder = textHolder.nextSibling
           if textHolder.td:
              artText          = textHolder.td
              artText.name     = 'div'
              artText.attrs    = []
              artText['class'] = 'text'
              artText.extract()
           soup.body.contents=[]
           soup.body.append(cleanTitle)
           soup.body.append(author)
           soup.body.append(cleanCrdate)
           soup.body.append(artText)
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
               str = item.string
               item.replaceWith(str)
            else:
               if limg:
                  item.name = 'div'
                  item.attrs = []
               else:
                   str = self.tag_to_string(item)
                   item.replaceWith(str)
        for item in soup.findAll('img'):
            if not item.has_key('alt'):
               item['alt'] = 'image'
        return soup