Fix #1891 (Updated various recipes for better EPUB support)

2025-07-09 03:04:10 -04:00 · 2009-02-19 10:24:48 -08:00 · 2009-02-19 10:24:48 -08:00 · f179a74a07
commit f179a74a07
parent 3c05e850d5
4 changed files with 58 additions and 56 deletions
--- a/src/calibre/web/feeds/recipes/recipe_harpers.py
+++ b/src/calibre/web/feeds/recipes/recipe_harpers.py
@@ -1,29 +1,40 @@
-#!/usr/bin/env  python
-
-__license__   = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
-'''
-harpers.org
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Harpers(BasicNewsRecipe):
-    title                 = u"Harper's Magazine"
-    __author__            = u'Darko Miletic'
-    language = _('English')
-    description           = u"Harper's Magazine: Founded June 1850."
-    oldest_article        = 30
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    use_embedded_content  = False
-    timefmt               = ' [%A, %d %B, %Y]' 
-
-    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
-    remove_tags = [
-                     dict(name='table', attrs={'class':'rcnt'})
-                    ,dict(name='table', attrs={'class':'rcnt topline'})
-                  ]
-
-    feeds       = [
-                   (u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')
-                   ]
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+'''
+harpers.org
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Harpers(BasicNewsRecipe):
+    title                 = u"Harper's Magazine"
+    __author__            = u'Darko Miletic'
+    language              = _('English')
+    description           = u"Harper's Magazine: Founded June 1850."
+    publisher             = "Harper's Magazine "
+    category              = 'news, politics, USA'
+    oldest_article        = 30
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    remove_javascript     = True
+
+    html2lrf_options = [
+                          '--comment', description
+                        , '--category', category
+                        , '--publisher', publisher
+                        ]
+    
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' 
+    
+    
+    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    remove_tags = [
+                     dict(name='table', attrs={'class':'rcnt'})
+                    ,dict(name='table', attrs={'class':'rcnt topline'})
+                    ,dict(name=['link','object','embed'])
+                  ]
+
+    feeds       = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
+
--- a/src/calibre/web/feeds/recipes/recipe_harpers_full.py
+++ b/src/calibre/web/feeds/recipes/recipe_harpers_full.py
@@ -10,8 +10,8 @@ images and pdf's are ignored

 from calibre import strftime

-from calibre.web.feeds.news import BasicNewsRecipe
-
+from calibre.web.feeds.news import BasicNewsRecipe
+
 class Harpers_full(BasicNewsRecipe):
    title                 = u"Harper's Magazine - articles from printed edition"
    __author__            = u'Darko Miletic'
@@ -23,7 +23,8 @@ class Harpers_full(BasicNewsRecipe):
    no_stylesheets        = True
    use_embedded_content  = False
    simultaneous_downloads = 1
-    delay = 1
+    delay                  = 1
+    language               = _('English')
    needs_subscription = True
    INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
    LOGIN = 'http://www.harpers.org'
@@ -31,12 +32,12 @@ class Harpers_full(BasicNewsRecipe):
    remove_javascript     = True
    
    html2lrf_options = [
-                          '--comment', description
+                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]
    
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' 

    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
    remove_tags = [
@@ -71,10 +72,4 @@ class Harpers_full(BasicNewsRecipe):
                                 ,'description':''
                                })
        return [(soup.head.title.string, articles)]
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
-        
-    language = _('English')
+        
--- a/src/calibre/web/feeds/recipes/recipe_pobjeda.py
+++ b/src/calibre/web/feeds/recipes/recipe_pobjeda.py
@@ -17,9 +17,6 @@ class Pobjeda(BasicNewsRecipe):
    description           = 'News from Montenegro'
    publisher             = 'Pobjeda a.d.'
    category              = 'news, politics, Montenegro'    
-    language              = _('Serbian')
-    oldest_article        = 2
-    max_articles_per_feed = 100
    no_stylesheets        = True
    remove_javascript     = True
    encoding              = 'utf8'
@@ -30,12 +27,14 @@ class Pobjeda(BasicNewsRecipe):
    
    html2lrf_options = [
                          '--comment', description
+                        , '--base-font-size', '10'
                        , '--category', category
                        , '--publisher', publisher
                        ]
    
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
-     
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
+
+    
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    keep_only_tags = [dict(name='div', attrs={'class':'vijest'})]
@@ -64,8 +63,6 @@ class Pobjeda(BasicNewsRecipe):
        soup.html['lang']     = 'sr-Latn-ME'
        mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
        return soup

    def get_cover_url(self):
@@ -81,16 +78,16 @@ class Pobjeda(BasicNewsRecipe):
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
-            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))             
            articles = []
-            soup = self.index_to_soup(feedurl)
+            soup = self.index_to_soup(feedurl)        
            for item in soup.findAll('div', attrs={'class':'vijest'}):
                description = self.tag_to_string(item.h2)
                atag = item.h1.find('a')
-                if atag:
+                if atag and atag.has_key('href'):
                    url         = self.INDEX + '/' + atag['href']
                    title       = self.tag_to_string(atag)
-                    date        = strftime(self.timefmt)
+                    date        = strftime(self.timefmt)                
                    articles.append({
                                      'title'      :title
                                     ,'date'       :date
--- a/src/calibre/web/feeds/recipes/recipe_pressonline.py
+++ b/src/calibre/web/feeds/recipes/recipe_pressonline.py
@@ -32,7 +32,7 @@ class PressOnline(BasicNewsRecipe):
                        , '--publisher', publisher
                        ]
    
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"' 
     
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

@@ -61,7 +61,6 @@ class PressOnline(BasicNewsRecipe):
        soup.html['lang']     = 'sr-Latn-RS'
        mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0,mtag)
-        img = soup.find('img')
-        if img:
-           del img['align']
+        for img in soup.findAll('img', align=True):
+            del img['align']
        return soup