from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment


class CoNowegoPl(BasicNewsRecipe):
    """Calibre news recipe for conowego.pl.

    Downloads the site's RSS feeds, keeps only the article container of each
    page, stitches multi-page articles into a single document and uses the
    image found on the site's magazine page as the cover.
    """

    title = u'conowego.pl'
    __author__ = 'fenuks'
    description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !'
    #cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png'
    category = 'IT, news'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    # Site root; relative links scraped from pages are appended to it.
    INDEX = 'http://www.conowego.pl/'
    extra_css = '.news-single-img {float:left; margin-right:5px;}'
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [dict(name='div', attrs={'class':'news_list single_view'})]
    remove_tags = [dict(name='div', attrs={'class':['ni_bottom', 'ni_rank', 'ni_date']})]
    feeds = [
        (u'Aktualno\u015bci', u'http://www.conowego.pl/rss/aktualnosci-5/?type=100'),
        (u'Gaming', u'http://www.conowego.pl/rss/gaming-6/?type=100'),
        (u'Porady', u'http://www.conowego.pl/rss/porady-3/?type=100'),
        (u'Testy', u'http://www.conowego.pl/rss/testy-2/?type=100'),
    ]

    def preprocess_html(self, soup):
        """Pad every image with separator nodes and merge follow-up pages.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        for i in soup.findAll('img'):
            # NOTE(review): in the source as received these two literals
            # contain only a line break; it looks like markup (possibly a
            # line-break tag) was stripped in transit -- confirm against the
            # upstream recipe before relying on this.
            # A fresh fragment is built for each insertion.
            i.parent.insert(0, BeautifulSoup('\n'))
            i.insert(len(i), BeautifulSoup('\n'))
        self.append_page(soup, soup.body)
        return soup

    def append_page(self, soup, appendtag):
        """Append the content of an article's remaining pages to ``appendtag``.

        Looks for a ``div.pages`` pagination block inside ``appendtag``; for
        every link in it except the last one, downloads the linked page and
        appends its ``ni_content`` element. Afterwards HTML comments and the
        pagination elements are removed from ``appendtag``.

        :param soup: the article soup (unused here, kept for the interface).
        :param appendtag: tag that receives the extra pages' content.
        """
        tag = appendtag.find('div', attrs={'class':'pages'})
        if not tag:
            return

        # The last anchor is skipped -- presumably a "next page" style link
        # that repeats an earlier target; TODO confirm against the site.
        for nexturl in tag.findAll('a')[:-1]:
            href = nexturl.get('href')
            if not href:
                # Anchor without a target: nothing to download.
                continue
            soup2 = self.index_to_soup(self.INDEX + href)
            pagetext = soup2.find(attrs={'class':'ni_content'})
            if pagetext is None:
                # Page without the expected content container; inserting
                # None into the tree would raise, so skip it.
                continue
            appendtag.insert(len(appendtag.contents), pagetext)

        # NOTE(review): nesting was reconstructed from a whitespace-collapsed
        # source; the cleanup below is assumed to run only for paginated
        # articles -- confirm.
        for comment in appendtag.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
            r.extract()

    def get_cover_url(self):
        """Return the cover image URL scraped from the magazine page.

        When the ``ms_left`` element or its image cannot be found, the
        existing ``cover_url`` attribute (if any) is returned instead.

        :return: the cover URL, or ``None`` when none is known.
        """
        soup = self.index_to_soup('http://www.conowego.pl/magazyn/')
        tag = soup.find(attrs={'class':'ms_left'})
        img = tag.find('img') if tag else None
        src = img.get('src') if img is not None else None
        if src:
            self.cover_url = self.INDEX + src
        # The previous default, ``self.cover_url``, was evaluated eagerly and
        # so offered no protection when the attribute is missing.
        return getattr(self, 'cover_url', None)