Pull from trunk

2025-11-18 04:23:02 -05:00 · 2010-09-29 20:09:44 -06:00 · 2010-09-29 20:09:44 -06:00 · c68d1716a0
commit c68d1716a0
parent 05cf311555 9ddd785824
2 changed files with 71 additions and 5 deletions
--- a/resources/recipes/peterschiff.recipe
+++ b/resources/recipes/peterschiff.recipe
@@ -12,15 +12,18 @@ class PeterSchiff(BasicNewsRecipe):
    description           = 'Economic commentary'
    publisher             = 'Euro Pacific capital'
    category              = 'news, politics, economy, USA'
-    oldest_article        = 15
+    oldest_article        = 25
    max_articles_per_feed = 200
    no_stylesheets        = True
-    encoding              = 'cp1252'
+    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en'
-    country               = 'US'
    remove_empty_feeds    = True
-    extra_css             = ' body{font-family: Verdana,Times,serif } h1{text-align: left} img{margin-bottom: 0.4em}  '
+    extra_css             = """ 
+                                body{font-family: Verdana,Times,serif } 
+                                .field-field-commentary-writer-name{font-weight: bold}
+                                .field-items{display: inline}
+                            """

    conversion_options = {
                          'comment'   : description
@@ -30,7 +33,15 @@ class PeterSchiff(BasicNewsRecipe):
                        , 'linearize_tables' : True
                        }

-    keep_only_tags = [dict(name='tr',attrs={'style':'vertical-align: top;'})]
+    keep_only_tags = [
+                        dict(name='h2',attrs={'id':'page-title'})
+                       ,dict(name='div',attrs={'class':'node'})
+                     ]
+    remove_tags = [
+                    dict(name=['meta','link','base','iframe','embed'])                   
+                   ,dict(attrs={'id':'text-zoom'})
+                  ]
+    remove_attributes=['track','linktype','lang']


    feeds = [(u'Articles', u'http://feeds.feedburner.com/PeterSchiffsEconomicCommentary')]
--- a/resources/recipes/rmf24_opinie.recipe
+++ b/resources/recipes/rmf24_opinie.recipe
@@ -0,0 +1,55 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
+'''
+rmf24.pl
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class RMF24_opinie(BasicNewsRecipe):
+    title          = u'Rmf24.pl - Opinie'
+    description    = u'Blogi, wywiady i komentarze ze strony rmf24.pl'
+    language = 'pl'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    __author__ = u'Tomasz D\u0142ugosz'
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds          = [(u'Blogi', u'http://www.rmf24.pl/opinie/blogi/feed'),
+                      (u'Kontrwywiad', u'http://www.rmf24.pl/opinie/wywiady/kontrwywiad/feed'),
+                      (u'Przes\u0142uchanie', u'http://www.rmf24.pl/opinie/wywiady/przesluchanie/feed'),
+                      (u'Komentarze', u'http://www.rmf24.pl/opinie/komentarze/feed')]
+
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'box articleSingle print'}),
+        dict(name='div', attrs={'class':'box articleSingle print singleCommentary'}),
+        dict(name='div', attrs={'class':'box articleSingle print blogSingleEntry'})]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':'toTop'}),
+        dict(name='div', attrs={'class':'category'}),
+        dict(name='div', attrs={'class':'REMOVE'}),
+        dict(name='div', attrs={'class':'embed embedAd'})]
+
+    extra_css = '''
+        h1 { font-size: 1.2em; }
+    '''
+
+    # thanks to Kovid Goyal
+    def get_article_url(self, article):
+        link = article.get('link')
+        if 'audio' not in link:
+            return link
+
+    preprocess_regexps = [
+        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
+        [
+            (r'<h2>Zdj.cie</h2>', lambda match: ''),
+            (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'),
+            (r'<a href="http://www.facebook.com/pages/RMF24pl/.*?>RMF24.pl</a> on Facebook</div>', lambda match: '</div>')
+        ]
+    ]