fix computerworld.pl recipe

This commit is contained in:
Tomasz Długosz 2017-10-27 23:02:29 +02:00
parent 8bfeac7440
commit c6d33d0add

View File

@ -14,19 +14,13 @@ class Computerworld_pl(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_attributes = ['style', ]
use_embedded_content = False use_embedded_content = False
preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''), preprocess_regexps = [(re.compile(u'Zobacz również:', re.IGNORECASE), lambda m: ''),
(re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ] (re.compile(ur'[*]+reklama[*]+', re.IGNORECASE), lambda m: ''), ]
keep_only_tags = [dict(id=['article-default-body'])] keep_only_tags = [dict(name='article')]
remove_tags = [dict(attrs={'class': ['share_tools nocontent', 'rec']}), dict( remove_tags = [dict(attrs={'class': ['share_tools nocontent', 'rec']}),
id=['topComment', 'bottom_tools'])] dict(name='ul',attrs={'class':'tags'}),
dict(name='ol'),
dict(id=['topComment', 'bottom_tools'])]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')] feeds = [(u'Wiadomo\u015bci', u'https://www.computerworld.pl/news?rss')]
def skip_ad_pages(self, soup):
    """Follow the first link of an interstitial ad page.

    A page counts as an ad page when its ``<title>`` text equals
    ``'advertisement'`` (case-insensitive).  For such a page the first
    ``<a>`` element is looked up and its ``href`` is fetched through
    ``self.index_to_soup(..., raw=True)``; that result is returned
    (presumably the raw HTML of the real article -- confirm against the
    ``index_to_soup`` documentation).

    Returns ``None`` when the page is not an ad page, or when no usable
    link can be found on it.
    """
    # A page may have no <title>, or a title without a single text node;
    # both cases used to raise AttributeError instead of meaning "not an ad".
    title = soup.title
    title_text = title.string if title is not None else None
    if not title_text or title_text.lower() != 'advertisement':
        return None

    tag = soup.find(name='a')
    if tag is None:
        return None

    # An anchor without href (e.g. a named anchor) gives nothing to follow;
    # indexing it directly used to raise KeyError.
    href = tag.get('href')
    if not href:
        return None

    return self.index_to_soup(href, raw=True)