from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment


class CoNowegoPl(BasicNewsRecipe):
    """Recipe that builds an e-book from the conowego.pl RSS feeds.

    Besides the declarative settings below, the recipe pads every image
    with an extra node, stitches multi-page articles into a single
    document and looks up a cover image on the site's magazine page.
    """

    title = u'conowego.pl'
    __author__ = 'fenuks'
    description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !'  # noqa
    category = 'IT, news'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    # Site root; relative links and image sources are appended to it.
    INDEX = 'http://www.conowego.pl/'
    extra_css = '.news-single-img {float:left; margin-right:5px;}'
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [
        dict(name='div', attrs={'class': 'news_list single_view'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['ni_bottom', 'ni_rank', 'ni_date']}),
    ]
    feeds = [
        (u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'),
        (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'),
        (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'),
        (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100'),
    ]

    def preprocess_html(self, soup):
        """Pad images and merge follow-up pages into the article.

        :param soup: parsed article document; modified in place.
        :return: the same ``soup`` object.
        """
        # NOTE(review): in the copy under review both literals below are a
        # bare line break inside the quotes (which does not even parse).
        # They may originally have held markup such as a line-break tag --
        # confirm against the upstream recipe before relying on this.
        for img in soup.findAll('img'):
            # One node goes to the front of the image's parent ...
            img.parent.insert(0, BeautifulSoup('\n'))
            # ... and one is inserted into the image tag itself, at index
            # len(img).
            img.insert(len(img), BeautifulSoup('\n'))
        self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        """Append the content of an article's follow-up pages to ``appendtag``.

        Does nothing unless ``appendtag`` contains a ``div`` with class
        ``pages``.

        :param soup: the article document (not used directly here).
        :param appendtag: tag that receives the extra content; modified
            in place.
        """
        pager = appendtag.find('div', attrs={'class': 'pages'})
        if not pager:
            return

        # The last link is skipped -- presumably a "next page" shortcut
        # that duplicates one of the earlier links; verify against the site.
        for link in pager.findAll('a')[:-1]:
            href = link.get('href')
            if not href:
                # Anchor without a target: nothing to download.
                continue
            page = self.index_to_soup(self.INDEX + href)
            content = page.find(attrs={'class': 'ni_content'})
            if content is None:
                # Page without an article body; inserting None would fail.
                continue
            appendtag.insert(len(appendtag.contents), content)

        # NOTE(review): the reviewed copy has no indentation, so the nesting
        # of the cleanup below is reconstructed; it is kept inside the
        # paginated branch -- confirm against the upstream recipe.
        for comment in appendtag.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for leftover in appendtag.findAll(
                attrs={'class': ['pages', 'paginationWrap']}):
            leftover.extract()

    def get_cover_url(self):
        """Return the cover image URL, refreshing it from the magazine page.

        ``self.cover_url`` is overwritten only when the page has an element
        with class ``ms_left`` that contains an image with a ``src``.

        :return: the value of ``self.cover_url``, or ``None`` if unset.
        """
        soup = self.index_to_soup(self.INDEX + 'magazyn/')
        holder = soup.find(attrs={'class': 'ms_left'})
        img = holder.find('img') if holder else None
        src = img.get('src') if img is not None else None
        if src:
            self.cover_url = self.INDEX + src
        # A None default keeps this safe even if the attribute was never set.
        return getattr(self, 'cover_url', None)