From f179a74a079ca8904333e860ba81b233039e8b40 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 19 Feb 2009 10:24:48 -0800
Subject: [PATCH] Fix #1891 (Updated various recipes for better EPUB support)

---
 .../web/feeds/recipes/recipe_harpers.py       | 69 +++++++++++--------
 .../web/feeds/recipes/recipe_harpers_full.py  | 19 ++---
 .../web/feeds/recipes/recipe_pobjeda.py       | 19 +++--
 .../web/feeds/recipes/recipe_pressonline.py   |  7 +-
 4 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/recipe_harpers.py b/src/calibre/web/feeds/recipes/recipe_harpers.py
index e15263730d..6370f6e0ea 100644
--- a/src/calibre/web/feeds/recipes/recipe_harpers.py
+++ b/src/calibre/web/feeds/recipes/recipe_harpers.py
@@ -1,29 +1,40 @@
-#!/usr/bin/env  python
-
-__license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
-'''
-harpers.org
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Harpers(BasicNewsRecipe):
-    title                 = u"Harper's Magazine"
-    __author__            = u'Darko Miletic'
-    language = _('English')
-    description           = u"Harper's Magazine: Founded June 1850."
-    oldest_article        = 30
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    use_embedded_content  = False
-    timefmt               = ' [%A, %d %B, %Y]' 
-
-    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
-    remove_tags = [
-                     dict(name='table', attrs={'class':'rcnt'})
-                    ,dict(name='table', attrs={'class':'rcnt topline'})
-                  ]
-
-    feeds       = [
-                   (u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')
-                   ]
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+'''
+harpers.org
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Harpers(BasicNewsRecipe):
+    # Recipe settings for the Harper's Magazine front page RSS feed listed in ``feeds``.
+    title                 = u"Harper's Magazine"
+    __author__            = u'Darko Miletic'
+    language              = _('English')
+    description           = u"Harper's Magazine: Founded June 1850."
+    publisher             = "Harper's Magazine"
+    category              = 'news, politics, USA'
+    oldest_article        = 30
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    remove_javascript     = True
+    # Flag/value pairs carrying the metadata above (presumably handed to the LRF converter).
+    html2lrf_options = [
+                          '--comment', description
+                        , '--category', category
+                        , '--publisher', publisher
+                        ]
+    # Newline separated key="value" options: the same metadata plus a CSS override for p and img.
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
+    # Keep only div#cached; within it drop the "rcnt" tables and any link/object/embed tags.
+    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    remove_tags = [
+                     dict(name='table', attrs={'class':'rcnt'})
+                    ,dict(name='table', attrs={'class':'rcnt topline'})
+                    ,dict(name=['link','object','embed'])
+                  ]
+
+    feeds       = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
+
diff --git a/src/calibre/web/feeds/recipes/recipe_harpers_full.py b/src/calibre/web/feeds/recipes/recipe_harpers_full.py
index 72e633bde0..69ec9d54f5 100644
--- a/src/calibre/web/feeds/recipes/recipe_harpers_full.py
+++ b/src/calibre/web/feeds/recipes/recipe_harpers_full.py
@@ -10,8 +10,8 @@ images and pdf's are ignored
 
 from calibre import strftime
 
-from calibre.web.feeds.news import BasicNewsRecipe
-
+from calibre.web.feeds.news import BasicNewsRecipe
+
 class Harpers_full(BasicNewsRecipe):
     title                 = u"Harper's Magazine - articles from printed edition"
     __author__            = u'Darko Miletic'
@@ -23,7 +23,8 @@ class Harpers_full(BasicNewsRecipe):
     no_stylesheets        = True
     use_embedded_content  = False
     simultaneous_downloads = 1
-    delay = 1
+    delay                  = 1
+    language               = _('English')
     needs_subscription = True
     INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
     LOGIN = 'http://www.harpers.org'
@@ -31,12 +32,12 @@ class Harpers_full(BasicNewsRecipe):
     remove_javascript     = True
     
     html2lrf_options = [
-                          '--comment', description
+                          '--comment', description
                         , '--category', category
                         , '--publisher', publisher
                         ]
     
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' 
 
     keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
     remove_tags = [
@@ -71,10 +72,4 @@ class Harpers_full(BasicNewsRecipe):
                                  ,'description':''
                                 })
         return [(soup.head.title.string, articles)]
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
-        
-    language = _('English')
\ No newline at end of file
+        
diff --git a/src/calibre/web/feeds/recipes/recipe_pobjeda.py b/src/calibre/web/feeds/recipes/recipe_pobjeda.py
index 9a4dbb0eee..5afb2b3f6a 100644
--- a/src/calibre/web/feeds/recipes/recipe_pobjeda.py
+++ b/src/calibre/web/feeds/recipes/recipe_pobjeda.py
@@ -17,9 +17,6 @@ class Pobjeda(BasicNewsRecipe):
     description           = 'News from Montenegro'
     publisher             = 'Pobjeda a.d.'
     category              = 'news, politics, Montenegro'    
-    language              = _('Serbian')
-    oldest_article        = 2
-    max_articles_per_feed = 100
     no_stylesheets        = True
     remove_javascript     = True
     encoding              = 'utf8'
@@ -30,12 +27,14 @@ class Pobjeda(BasicNewsRecipe):
     
     html2lrf_options = [
                           '--comment', description
+                        , '--base-font-size', '10'
                         , '--category', category
                         , '--publisher', publisher
                         ]
     
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
-     
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
+
+    
     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
 
     keep_only_tags = [dict(name='div', attrs={'class':'vijest'})]
@@ -64,8 +63,6 @@ class Pobjeda(BasicNewsRecipe):
         soup.html['lang']     = 'sr-Latn-ME'
         mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
         soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
         return soup
 
     def get_cover_url(self):
@@ -81,16 +78,16 @@ class Pobjeda(BasicNewsRecipe):
         lfeeds = self.get_feeds()
         for feedobj in lfeeds:
             feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))             
             articles = []
-            soup = self.index_to_soup(feedurl)
+            soup = self.index_to_soup(feedurl)        
             for item in soup.findAll('div', attrs={'class':'vijest'}):
                 description = self.tag_to_string(item.h2)
                 atag = item.h1.find('a')
-                if atag:
+                if atag and atag.has_key('href'):
                     url         = self.INDEX + '/' + atag['href']
                     title       = self.tag_to_string(atag)
-                    date        = strftime(self.timefmt)
+                    date        = strftime(self.timefmt)                
                     articles.append({
                                       'title'      :title
                                      ,'date'       :date
diff --git a/src/calibre/web/feeds/recipes/recipe_pressonline.py b/src/calibre/web/feeds/recipes/recipe_pressonline.py
index 41525cfc5f..71f69b9169 100644
--- a/src/calibre/web/feeds/recipes/recipe_pressonline.py
+++ b/src/calibre/web/feeds/recipes/recipe_pressonline.py
@@ -32,7 +32,7 @@ class PressOnline(BasicNewsRecipe):
                         , '--publisher', publisher
                         ]
     
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' 
      
     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
 
@@ -61,7 +61,6 @@ class PressOnline(BasicNewsRecipe):
         soup.html['lang']     = 'sr-Latn-RS'
         mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
         soup.head.insert(0,mtag)
-        img = soup.find('img')
-        if img:
-           del img['align']
+        for img in soup.findAll('img', align=True):
+            del img['align']
         return soup        
\ No newline at end of file