diff --git a/recipes/cgm_pl.recipe b/recipes/cgm_pl.recipe index eba856ac3a..485cf45245 100644 --- a/recipes/cgm_pl.recipe +++ b/recipes/cgm_pl.recipe @@ -9,9 +9,10 @@ class CGM(BasicNewsRecipe): category = 'music' language = 'pl' use_embedded_content = False + remove_empty_feeds= True max_articles_per_feed = 100 no_stylesheers=True - extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;}' + extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}' remove_tags_before=dict(id='mainContent') remove_tags_after=dict(name='div', attrs={'class':'fbContainer'}) remove_tags=[dict(name='div', attrs={'class':'fbContainer'}), @@ -22,10 +23,12 @@ class CGM(BasicNewsRecipe): def preprocess_html(self, soup): - ad=soup.findAll('img') + for item in soup.findAll(style=True): + del item['style'] + ad=soup.findAll('a') for r in ad: - if '/_vault/_article_photos/5841.jpg' in r['src'] or '_vault/_article_photos/5807.jpg' in r['src'] or 'article_photos/5841.jpg' in r['src'] or 'article_photos/5825.jpg' in r['src'] or '_article_photos/5920.jpg' in r['src'] or '_article_photos/5919.jpg' in r['src'] or '_article_photos/5918.jpg' in r['src'] or '_article_photos/5914.jpg' in r['src'] or '_article_photos/5911.jpg' in r['src'] or '_article_photos/5923.jpg' in r['src'] or '_article_photos/5921.jpg' in r['src']: - ad[ad.index(r)].extract() + if 'http://www.hustla.pl' in r['href']: + r.extract() gallery=soup.find('div', attrs={'class':'galleryFlash'}) if gallery: img=gallery.find('embed') diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe index 091c0bb1dc..c8655dc9cd 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -9,8 +9,17 @@ class Gram_pl(BasicNewsRecipe): oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True + extra_css = 'h2 {font-style: italic; font-size:20px;}' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})] feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'), (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')] + + def parse_feeds (self): + feeds = BasicNewsRecipe.parse_feeds(self) + for feed in feeds: + for article in feed.articles[:]: + if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper(): + feed.articles.remove(article) + return feeds diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index 08212fbc66..af317d1b09 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -9,4 +9,6 @@ class Tablety_pl(BasicNewsRecipe): language = 'pl' oldest_article = 8 max_articles_per_feed = 100 + keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})] + remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})] feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]