New recipe for US News by Darko Miletic

2025-07-09 03:04:10 -04:00 · 2009-04-30 17:44:23 -07:00 · 2009-04-30 17:44:23 -07:00 · 2ee77796fb
commit 2ee77796fb
parent e869684a29
4 changed files with 125 additions and 48 deletions
--- a/src/calibre/gui2/images/news/usnews.png
+++ b/src/calibre/gui2/images/news/usnews.png
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -41,7 +41,7 @@ recipe_modules = ['recipe_' + r for r in (
           'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
           'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
           'seattle_times', 'scott_hanselman', 'coding_horror',
-           'stackoverflow', 'telepolis_artikel', 'zaobao',
+           'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
          )]
 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_spiegelde.py
+++ b/src/calibre/web/feeds/recipes/recipe_spiegelde.py
@@ -1,47 +1,64 @@
-__license__   = 'GPL v3'
+#!/usr/bin/env  python
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
-
+__license__   = 'GPL v3'
-'''
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
-Fetch Spiegel Online.
+'''
-'''
+spiegel.de
-
+'''
-import re
+
-
+from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
-
+class Spiegel_ger(BasicNewsRecipe):
-class SpeigelOnline(BasicNewsRecipe):
+    title                 = 'Spiegel Online - German'
-
+    __author__            = 'Darko Miletic'
-    title = 'Spiegel Online'
+    description           = "Immer die neueste Meldung auf dem Schirm, sekundenaktuell und uebersichtlich: Mit dem RSS-Angebot von SPIEGEL ONLINE entgeht Ihnen keine wichtige Meldung mehr, selbst wenn Sie keinen Internet-Browser geoeffnet haben. Sie koennen unsere Nachrichten-Feeds ganz einfach abonnieren - unkompliziert, kostenlos und nach Ihren persoenlichen Themen-Vorlieben."
-    description = 'Nachrichten des Magazins Der Spiegel'
+    publisher             = 'SPIEGEL ONLINE Gmbh'
-    __author__ = 'Kovid Goyal'
+    category              = 'SPIEGEL ONLINE, DER SPIEGEL, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'    
-    use_embedded_content   = False
+    oldest_article        = 7
-    language = _('German')
+    max_articles_per_feed = 100
-    timefmt = ' [ %Y-%m-%d %a]'
+    language              = _('German')
-    max_articles_per_feed = 40
+    lang                  = 'de-DE'
-    no_stylesheets = True
+    no_stylesheets        = True
-
+    use_embedded_content  = False
-    preprocess_regexps = \
+    encoding              = 'cp1252'
-        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+    
-            [
+    html2lrf_options = [
-             # Remove Zum Thema footer
+                          '--comment', description
-             (r'<div class="spArticleCredit.*?</body>', lambda match: '</body>'),
+                        , '--category', category
-             ]
+                        , '--publisher', publisher
-            ]
+                        ]
-
+
-    feeds= [ ('Spiegel Online', 'http://www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml') ]
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
-
+                        
-
+    keep_only_tags = [dict(name='div', attrs={'id':'spMainContent'})]
-    def print_version(self,url):
+
-        tokens = url.split(',')
+    remove_tags = [dict(name=['object','link','base'])]
-        tokens[-2:-2] = ['druck|']
+    
-        return ','.join(tokens).replace('|,','-')
+    remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'})
-
+
-    def postprocess_html(self, soup, first_fetch):
+    feeds          = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/index.rss')]
-        if soup.contents[0].name == 'head':
+
-            x = BeautifulSoup('<html></html>')
+    def print_version(self, url):
-            for y in reversed(soup.contents):
+        main, sep, rest = url.rpartition(',')
-                x.contents[0].insert(0, y)
+        rmain, rsep, rrest = main.rpartition(',')
-            soup = x
+        return rmain + ',druck-' + rrest + ',' + rest
-
+
-        return soup
+    def preprocess_html(self, soup):
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        htmltag = soup.find('html')
        if not htmltag:
            thtml = Tag(soup,'html',[("lang",self.lang),("xml:lang",self.lang),("dir","ltr")])
            soup.insert(0,thtml)
            thead = soup.head
            tbody = soup.body
            thead.extract()
            tbody.extract()
            soup.html.insert(0,tbody)
            soup.html.insert(0,thead)
        return soup
--- a/src/calibre/web/feeds/recipes/recipe_usnews.py
+++ b/src/calibre/web/feeds/recipes/recipe_usnews.py
@@ -0,0 +1,60 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.usnews.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class LaPrensa(BasicNewsRecipe):
    """Recipe that builds a periodical from the www.usnews.com RSS feeds.

    The class only supplies configuration attributes plus three hooks
    (print_version, get_article_url, preprocess_html); everything else is
    inherited from BasicNewsRecipe.

    NOTE(review): the class name looks like a leftover from another recipe;
    it is kept unchanged so that anything referring to it keeps working.
    """
    # --- Descriptive metadata -------------------------------------------
    title                 = 'US & World Report news'
    __author__            = 'Darko Miletic'
    description           = 'News from USA and world'
    publisher             = 'U.S.News & World Report, L.P.'
    category              = 'news, politics, USA'
    # --- Download settings ----------------------------------------------
    # presumably the maximum article age in days -- confirm against
    # the BasicNewsRecipe documentation
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('English')
    # --- Converter options, built from the metadata above ---------------
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]
    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
    # Tag filters: the headline, the dateline block and the credit/body
    # blocks of each article page.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'id': ['dateline']}),
        dict(name='div', attrs={'class': ['blogCredit', 'body']}),
    ]
    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Homepage'        , u'http://www.usnews.com/rss/usnews.rss'          ),
        (u'Health'          , u'http://www.usnews.com/rss/health/index.rss'    ),
        (u'Nation & World'  , u'http://www.usnews.com/rss/news/index.rss'      ),
        (u'Money & Business', u'http://www.usnews.com/rss/business/index.rss'  ),
        (u'Education'       , u'http://www.usnews.com/rss/education/index.rss' ),
        (u'Opinion'         , u'http://www.usnews.com/rss/opinion/index.rss'   ),
        (u'Science'         , u'http://www.usnews.com/rss/science/index.rss'   ),
    ]
    def print_version(self, url):
        """Return `url` with every '.html' replaced by '_print.html'."""
        return url.replace('.html','_print.html')
    def get_article_url(self, article):
        """Return the entry's link with the text after the last '?' removed.

        `article` must provide a dict-style get(); its 'link' value is used.
        A missing or empty link is returned unchanged (None or '') instead
        of raising, and a link without any '?' is returned as is.
        """
        raw = article.get('link',  None)
        if not raw:
            # Nothing to clean up; avoid calling a str method on None.
            return raw
        artcl, sep, query = raw.rpartition('?')
        # When the separator is absent, rpartition() returns ('', '', raw),
        # so the head would be an empty string: fall back to the full link.
        if not sep:
            return raw
        return artcl
    def preprocess_html(self, soup):
        """Remove the body 'onload' attribute and all inline 'style' attributes.

        The soup is modified in place and also returned.
        """
        body = soup.body
        # Pages without a <body> element must not abort the clean-up.
        if body is not None:
            del body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        return soup