New recipe for US News by Darko Miletic

2025-07-09 03:04:10 -04:00 · 2009-04-30 17:44:23 -07:00 · 2009-04-30 17:44:23 -07:00 · 2ee77796fb
commit 2ee77796fb
parent e869684a29
4 changed files with 125 additions and 48 deletions
--- a/src/calibre/gui2/images/news/usnews.png
+++ b/src/calibre/gui2/images/news/usnews.png
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -41,7 +41,7 @@ recipe_modules = ['recipe_' + r for r in (
           'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
           'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
           'seattle_times', 'scott_hanselman', 'coding_horror',
-           'stackoverflow', 'telepolis_artikel', 'zaobao',
+           'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
          )]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_spiegelde.py
+++ b/src/calibre/web/feeds/recipes/recipe_spiegelde.py
@@ -1,47 +1,64 @@
+#!/usr/bin/env  python
+
 __license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
-Fetch Spiegel Online.
+spiegel.de
 '''

-import re
-
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

-class SpeigelOnline(BasicNewsRecipe):
-
-    title = 'Spiegel Online'
-    description = 'Nachrichten des Magazins Der Spiegel'
-    __author__ = 'Kovid Goyal'
-    use_embedded_content   = False
+class Spiegel_ger(BasicNewsRecipe):
+    title                 = 'Spiegel Online - German'
+    __author__            = 'Darko Miletic'
+    description           = "Immer die neueste Meldung auf dem Schirm, sekundenaktuell und uebersichtlich: Mit dem RSS-Angebot von SPIEGEL ONLINE entgeht Ihnen keine wichtige Meldung mehr, selbst wenn Sie keinen Internet-Browser geoeffnet haben. Sie koennen unsere Nachrichten-Feeds ganz einfach abonnieren - unkompliziert, kostenlos und nach Ihren persoenlichen Themen-Vorlieben."
+    publisher             = 'SPIEGEL ONLINE Gmbh'
+    category              = 'SPIEGEL ONLINE, DER SPIEGEL, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'    
+    oldest_article        = 7
+    max_articles_per_feed = 100
    language              = _('German')
-    timefmt = ' [ %Y-%m-%d %a]'
-    max_articles_per_feed = 40
+    lang                  = 'de-DE'
    no_stylesheets        = True
+    use_embedded_content  = False
+    encoding              = 'cp1252'
    
-    preprocess_regexps = \
-        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-            [
-             # Remove Zum Thema footer
-             (r'<div class="spArticleCredit.*?</body>', lambda match: '</body>'),
-             ]
+    html2lrf_options = [
+                          '--comment', description
+                        , '--category', category
+                        , '--publisher', publisher
                        ]

-    feeds= [ ('Spiegel Online', 'http://www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml') ]
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
                        
+    keep_only_tags = [dict(name='div', attrs={'id':'spMainContent'})]
+
+    remove_tags = [dict(name=['object','link','base'])]
+    
+    remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'})
+
+    feeds          = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/index.rss')]

    def print_version(self, url):
-        tokens = url.split(',')
-        tokens[-2:-2] = ['druck|']
-        return ','.join(tokens).replace('|,','-')
-
-    def postprocess_html(self, soup, first_fetch):
-        if soup.contents[0].name == 'head':
-            x = BeautifulSoup('<html></html>')
-            for y in reversed(soup.contents):
-                x.contents[0].insert(0, y)
-            soup = x
+        main, sep, rest = url.rpartition(',')
+        rmain, rsep, rrest = main.rpartition(',')
+        return rmain + ',druck-' + rrest + ',' + rest

+    def preprocess_html(self, soup):  # adds language/charset metas, strips inline styles, ensures an <html> root; returns soup
+        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
+        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
+        soup.head.insert(0,mlang)  # both inserts require soup.head to exist
+        soup.head.insert(1,mcharset)
+        for item in soup.findAll(style=True):  # every element carrying a style attribute
+            del item['style']
+        htmltag = soup.find('html')
+        if not htmltag:  # document has no <html> element: build one and move head/body into it
+            thtml = Tag(soup,'html',[("lang",self.lang),("xml:lang",self.lang),("dir","ltr")])
+            soup.insert(0,thtml)
+            thead = soup.head
+            tbody = soup.body
+            thead.extract()  # detach both from their current position before re-parenting
+            tbody.extract()
+            soup.html.insert(0,tbody)  # body goes in first ...
+            soup.html.insert(0,thead)  # ... so that head ends up ahead of it
        return soup
--- a/src/calibre/web/feeds/recipes/recipe_usnews.py
+++ b/src/calibre/web/feeds/recipes/recipe_usnews.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.usnews.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class LaPrensa(BasicNewsRecipe):  # www.usnews.com recipe; class name left unchanged for callers
+    title                 = 'US & World Report news'
+    __author__            = 'Darko Miletic'
+    description           = 'News from USA and world'
+    publisher             = 'U.S.News & World Report, L.P.'
+    category              = 'news, politics, USA'
+    oldest_article        = 2
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    encoding              = 'utf-8'
+    language              = _('English')
+    # Option list assembled from the metadata strings above.
+    html2lrf_options = [
+        '--comment', description,
+        '--category', category,
+        '--publisher', publisher,
+    ]
+    # The same metadata as newline-separated key="value" pairs.
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    # Tag filters: every <h1>, the div with id 'dateline', divs with class 'blogCredit' or 'body'.
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='div', attrs={'id':['dateline']}),
+        dict(name='div', attrs={'class':['blogCredit','body']}),
+    ]
+    # (section title, RSS feed URL) pairs.
+    feeds = [
+        (u'Homepage'        , u'http://www.usnews.com/rss/usnews.rss'          ),
+        (u'Health'          , u'http://www.usnews.com/rss/health/index.rss'    ),
+        (u'Nation & World'  , u'http://www.usnews.com/rss/news/index.rss'      ),
+        (u'Money & Business', u'http://www.usnews.com/rss/business/index.rss'  ),
+        (u'Education'       , u'http://www.usnews.com/rss/education/index.rss' ),
+        (u'Opinion'         , u'http://www.usnews.com/rss/opinion/index.rss'   ),
+        (u'Science'         , u'http://www.usnews.com/rss/science/index.rss'   ),
+    ]
+
+    def print_version(self, url):  # every '.html' in the URL becomes '_print.html'
+        return url.replace('.html','_print.html')
+
+    def get_article_url(self, article):
+        raw = article.get('link',  None)  # may be missing, or carry no query string
+        artcl, sep, unneeded = (raw or '').rpartition('?')
+        return artcl if sep else raw  # without a '?' rpartition puts '' first, so keep raw
+
+    def preprocess_html(self, soup):  # drops the body's onload attribute and all inline styles
+        del soup.body['onload']
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
+