This commit is contained in:
Kovid Goyal 2011-10-02 08:24:11 -06:00
parent c987d56c8e
commit 66ea17fe86
3 changed files with 18 additions and 4 deletions

View File

@ -9,9 +9,10 @@ class CGM(BasicNewsRecipe):
category = 'music' category = 'music'
language = 'pl' language = 'pl'
use_embedded_content = False use_embedded_content = False
remove_empty_feeds= True
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheers=True no_stylesheers=True
extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;}' extra_css = 'div {color:black;} strong {color:black;} span {color:black;} p {color:black;} h2 {color:black;}'
remove_tags_before=dict(id='mainContent') remove_tags_before=dict(id='mainContent')
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'}) remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}), remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
@ -22,10 +23,12 @@ class CGM(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
ad=soup.findAll('img') for item in soup.findAll(style=True):
del item['style']
ad=soup.findAll('a')
for r in ad: for r in ad:
if '/_vault/_article_photos/5841.jpg' in r['src'] or '_vault/_article_photos/5807.jpg' in r['src'] or 'article_photos/5841.jpg' in r['src'] or 'article_photos/5825.jpg' in r['src'] or '_article_photos/5920.jpg' in r['src'] or '_article_photos/5919.jpg' in r['src'] or '_article_photos/5918.jpg' in r['src'] or '_article_photos/5914.jpg' in r['src'] or '_article_photos/5911.jpg' in r['src'] or '_article_photos/5923.jpg' in r['src'] or '_article_photos/5921.jpg' in r['src']: if 'http://www.hustla.pl' in r['href']:
ad[ad.index(r)].extract() r.extract()
gallery=soup.find('div', attrs={'class':'galleryFlash'}) gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery: if gallery:
img=gallery.find('embed') img=gallery.find('embed')

View File

@ -9,8 +9,17 @@ class Gram_pl(BasicNewsRecipe):
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets= True no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})] keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'), feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
(u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')] (u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
def parse_feeds(self):
    """Return the feeds from BasicNewsRecipe.parse_feeds, minus filtered titles.

    Every article whose upper-cased title contains one of the markers
    listed below is removed from its feed's ``articles`` list in place.
    The (mutated) feed collection returned by the base implementation is
    handed back to the caller.
    """
    unwanted_markers = ('REKLAMA SKLEP', u'ARTYKUŁ:')
    parsed = BasicNewsRecipe.parse_feeds(self)
    for feed in parsed:
        # Walk a snapshot so that removing entries does not disturb iteration.
        for entry in list(feed.articles):
            shouted_title = entry.title.upper()
            if any(marker in shouted_title for marker in unwanted_markers):
                feed.articles.remove(entry)
    return parsed

View File

@ -9,4 +9,6 @@ class Tablety_pl(BasicNewsRecipe):
language = 'pl' language = 'pl'
oldest_article = 8 oldest_article = 8
max_articles_per_feed = 100 max_articles_per_feed = 100
keep_only_tags=[dict(name='header', attrs={'class':'entry-header'}), dict(name='div', attrs={'class':'entry-content clearfix'})]
remove_tags=[dict(name='div', attrs={'class':'snap_nopreview sharing robots-nocontent'}), dict(name='span', attrs={'class':'dsq-postid'})]
feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')]