...

2025-07-09 03:04:10 -04:00 · 2011-05-15 08:34:10 -06:00 · 2011-05-15 08:34:10 -06:00 · 902dc7aad6
commit 902dc7aad6
parent e149160e9a
3 changed files with 97 additions and 0 deletions
--- a/recipes/bild_de.recipe
+++ b/recipes/bild_de.recipe
@ -0,0 +1,46 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class AdvancedUserRecipe1303841067(BasicNewsRecipe):
     # calibre news recipe for the German tabloid site Bild.de.
     # --- recipe metadata ---
     title = u'Bild.de'
     __author__ = 'schuster'
     language = 'de'
     # --- fetch limits (calibre measures oldest_article in days) ---
     oldest_article = 1
     max_articles_per_feed = 50
     use_embedded_content = False
     # --- output clean-up switches ---
     no_stylesheets = True
     remove_javascript = True
     # Cover image; the file is hosted on a MySpace CDN.
     cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
     # Keep only the part of the page between the <h2 id="cover"> heading
     # and the <div class="back"> element.
     remove_tags_before = {'name': 'h2', 'attrs': {'id': 'cover'}}
     remove_tags_after = {'name': 'div', 'attrs': {'class': 'back'}}
     # Do not follow links that point at the smartadserver.com ad server.
     filter_regexps = [r'.\.smartadserver\.com']
     # RSS sources published by www.bild.de, as (section title, feed URL).
     feeds = [
         (u'Überblick', u'http://rss.bild.de/bild.xml'),
         (u'News', u'http://rss.bild.de/bild-news.xml'),
         (u'Politik', u'http://rss.bild.de/bild-politik.xml'),
         (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
         (u'Sport', u'http://rss.bild.de/bild-sport.xml'),
         (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
         (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
     ]
     # Replace every <a> element that has a single string child with that
     # plain string, which strips most inline links from the article text.
     # Anchors whose .string is None (e.g. wrapping other tags) are kept.
     # Credit: kiklop74 (sticky thread "Recipes - Re-usable code").
     def preprocess_html(self, soup):
         for anchor in soup.findAll('a'):
             label = anchor.string
             if label is None:
                 continue
             anchor.replaceWith(label)
         return soup
     # Ad-page hook: this recipe never substitutes a page, so it always
     # returns None.
     def skip_ad_pages(self, soup):
         return None
     # Use the feed entry's 'id' (falling back to 'guid', then None) as the
     # article URL; per the original author this yields the real URL instead
     # of the .feedsportal.com redirect.
     def get_article_url(self, article):
         guid_fallback = article.get('guid', None)
         return article.get('id', guid_fallback)
--- a/recipes/max_planck.recipe
+++ b/recipes/max_planck.recipe
@ -0,0 +1,22 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class AdvancedUserRecipe1303841067(BasicNewsRecipe):
     # calibre news recipe for the Max Planck Society research feed (mpg.de).
     # --- recipe metadata ---
     title = u'Max-Planck-Inst.'
     __author__ = 'schuster'
     language = 'de'
     # --- fetch limits (calibre measures oldest_article in days) ---
     oldest_article = 30
     max_articles_per_feed = 100
     use_embedded_content = False
     # --- output clean-up switches ---
     no_stylesheets = True
     remove_javascript = True
     # Page furniture to strip: matched by CSS class, by element id, and by
     # tag name respectively.
     remove_tags = [
         {'attrs': {'class': [
             'clearfix', 'lens', 'col2_box_list',
             'col2_box_teaser group_ext no_print', 'dotted_line',
             'col2_box_teaser', 'box_image small', 'bold',
             'col2_box_teaser no_print', 'print_kontakt',
         ]}},
         {'id': ['ie_clearing', 'col2', 'col2_content']},
         {'name': ['script', 'noscript', 'style']},
     ]
     # Single feed, as (section title, feed URL).
     feeds = [(u'Forschung', u'http://www.mpg.de/de/forschung.rss')]
     # Build the print-view URL from the fourth '/'-separated piece of the
     # article URL (the first path segment of an absolute http:// URL).
     # Raises IndexError when the URL has fewer than four such pieces.
     def print_version(self, url):
         return 'http://www.mpg.de/print/' + url.split("/")[3]
--- a/recipes/ngz.recipe
+++ b/recipes/ngz.recipe
@ -0,0 +1,29 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class AdvancedUserRecipe1303841067(BasicNewsRecipe):
     # calibre news recipe for the regional newspaper site NGZ-online.
     # --- recipe metadata ---
     title = u'NGZ-online'
     __author__ = 'schuster'
     language = 'de'
     # --- fetch limits (calibre measures oldest_article in days) ---
     oldest_article = 7
     max_articles_per_feed = 100
     use_embedded_content = False
     # --- output clean-up switches ---
     no_stylesheets = True
     remove_javascript = True
     # Cover image (NGZ logo hosted on a third-party site).
     cover_url = 'http://www.rhein-kreis-neuss-macht-sport.de/sport/includes/bilder/ngz_logo.jpg'
     # Keep only the part of the page between the elements with id "bu"
     # and id "noblock".
     remove_tags_before = {'id': 'bu'}
     remove_tags_after = {'id': 'noblock'}
     # Page furniture to strip: matched by CSS class, by element id, and by
     # tag name respectively.
     # NOTE(review): many entries in the id list look like link captions
     # rather than element ids -- confirm against the live markup before
     # pruning; they are kept verbatim here.
     remove_tags = [
         {'attrs': {'class': [
             'articleTools', 'post-tools', 'side_tool',
             'nextArticleLink clearfix', 'liketext',
         ]}},
         {'id': [
             'footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
             'side_search', 'blog_sidebar', 'side_tool', 'side_index',
             'Verlinken', 'vorheriger', 'LESERKOMMENTARE', 'bei facebook',
             'bei twitter', 'Schreiben Sie jetzt Ihre Meinung:', 'Thema',
             'Ihr Beitrag', 'Ihr Name',
             'Ich möchte über weitere Lesermeinungen zu diesem Artikel per E-Mail informiert werden.',
             'banneroben', 'bannerrechts', 'inserieren', 'stellen', 'auto',
             'immobilien', 'kleinanzeige', 'tiere', 'ferienwohnung',
             'NGZ Card', 'Mediengruppe RP', 'Werben', 'Newsletter', 'Wetter',
             'RSS', 'Abo', 'Anzeigen', 'Redaktion', 'Schulprojekte', 'Gast',
             'Mein NGZ', 'Nachrichten', 'Sport', 'Wirtschaft', 'Stadt-Infos',
             'Bilderserien', 'Bookmarken', 'del.icio.us', 'Mister Wong',
             'YiGG', 'Webnews', 'Shortnews', 'Twitter', 'Newsider',
             'Facebook', 'StudiVZ/MeinVZ', 'Versenden', 'Drucken',
         ]},
         {'name': ['script', 'noscript', 'style']},
     ]
     # Regional feeds, as (section title, feed URL).
     feeds = [
         (u'Grevenbroich', u'http://www.ngz-online.de/app/feed/rss/grevenbroich'),
         (u'Kreis Neuss', u'http://www.ngz-online.de/app/feed/rss/rheinkreisneuss'),
         (u'Dormagen', u'http://www.ngz-online.de/app/feed/rss/dormagen'),
         (u'J\xfcchen', u'http://www.ngz-online.de/app/feed/rss/juechen'),
         (u'Rommerskirchen', u'http://www.ngz-online.de/app/feed/rss/rommerskirchen'),
     ]
     # Derive the print/popup view of an article by appending the site's
     # popup-layout query string to the article URL.
     def print_version(self, url):
         popup_query = '?ot=de.circit.rpo.PopupPageLayout.ot'
         return url + popup_query