...

2025-07-09 03:04:10 -04:00 · 2011-10-02 08:24:11 -06:00 · 2011-10-02 08:24:11 -06:00 · 66ea17fe86
commit 66ea17fe86
parent c987d56c8e
3 changed files with 18 additions and 4 deletions
--- a/recipes/cgm_pl.recipe
+++ b/recipes/cgm_pl.recipe
@ -9,9 +9,10 @@ class CGM(BasicNewsRecipe):
    # Recipe metadata.
    category = 'music'
    language = 'pl'
    # Options read by calibre's BasicNewsRecipe machinery.
    use_embedded_content = False
    remove_empty_feeds = True
    max_articles_per_feed = 100
    # This option was previously spelled "no_stylesheers", which merely
    # created an unused class attribute; the option name calibre recognises
    # is "no_stylesheets".
    no_stylesheets = True
-    extra_css      = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;}'
+    extra_css      = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
    remove_tags_before=dict(id='mainContent')
    remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
    remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
@ -22,10 +23,12 @@ class CGM(BasicNewsRecipe):
    def preprocess_html(self, soup):
-        ad=soup.findAll('img')
+        for item in soup.findAll(style=True):
            del item['style']
        ad=soup.findAll('a')
        for r in ad:
-            if '/_vault/_article_photos/5841.jpg' in r['src'] or '_vault/_article_photos/5807.jpg' in r['src'] or 'article_photos/5841.jpg' in r['src'] or 'article_photos/5825.jpg' in r['src'] or '_article_photos/5920.jpg' in r['src']  or '_article_photos/5919.jpg' in r['src'] or '_article_photos/5918.jpg' in r['src'] or '_article_photos/5914.jpg' in r['src'] or '_article_photos/5911.jpg' in r['src'] or '_article_photos/5923.jpg' in r['src'] or '_article_photos/5921.jpg' in r['src']:                
+            if 'http://www.hustla.pl' in r['href']:                
-                 ad[ad.index(r)].extract()
+                 r.extract()
        gallery=soup.find('div', attrs={'class':'galleryFlash'})
        if gallery:
            img=gallery.find('embed')
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@ -9,8 +9,17 @@ class Gram_pl(BasicNewsRecipe):
    # Feed limits and styling options.
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    extra_css = 'h2 {font-style: italic;  font-size:20px;}'
    cover_url = u'http://www.gram.pl/www/01/img/grampl_zima.png'

    # Tag specifications handed to calibre as "remove_tags".
    remove_tags = [
        dict(name='p', attrs={'class': ['extraText', 'must-log-in']}),
        dict(attrs={'class': ['el', 'headline', 'post-info']}),
        dict(
            name='div',
            attrs={'class': [
                'twojaOcena',
                'comment-body',
                'comment-author vcard',
                'comment-meta commentmetadata',
                'tw_button',
            ]},
        ),
        dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title']),
    ]

    # Tag specifications handed to calibre as "keep_only_tags".
    keep_only_tags = [
        dict(
            name='div',
            attrs={'class': [
                'main',
                'arkh-postmetadataheader',
                'arkh-postcontent',
                'post',
                'content',
                'news_header',
                'news_subheader',
                'news_text',
            ]},
        ),
        dict(attrs={'class': ['contentheading', 'contentpaneopen']}),
    ]

    # (title, url) pairs for the two gram.pl feeds.
    feeds = [
        (u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
        (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),
    ]
    def parse_feeds(self):
        """Parse the feeds, then drop articles with unwanted titles.

        Delegates to ``BasicNewsRecipe.parse_feeds`` and removes, in place,
        every article whose upper-cased title contains one of the markers
        listed below. Returns the (filtered) feeds object.
        """
        unwanted_markers = ('REKLAMA SKLEP', u'ARTYKUŁ:')
        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            # Walk a snapshot so removing from feed.articles is safe.
            for entry in feed.articles[:]:
                shouted_title = entry.title.upper()
                if any(marker in shouted_title for marker in unwanted_markers):
                    feed.articles.remove(entry)
        return parsed
--- a/recipes/tablety_pl.recipe
+++ b/recipes/tablety_pl.recipe
@ -9,4 +9,6 @@ class Tablety_pl(BasicNewsRecipe):
    # Recipe language and feed limits.
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100

    # Tag specifications handed to calibre as "keep_only_tags".
    keep_only_tags = [
        dict(name='header', attrs={'class': 'entry-header'}),
        dict(name='div', attrs={'class': 'entry-content clearfix'}),
    ]

    # Tag specifications handed to calibre as "remove_tags".
    remove_tags = [
        dict(name='div', attrs={'class': 'snap_nopreview sharing robots-nocontent'}),
        dict(name='span', attrs={'class': 'dsq-postid'}),
    ]

    # Single (title, url) feed entry.
    feeds = [
        (u'Najnowsze posty', u'http://www.tablety.pl/feed/'),
    ]