from calibre.web.feeds.news import BasicNewsRecipe


class PC_Arena(BasicNewsRecipe):
    """Calibre news recipe for pcarena.pl.

    Pulls the site's "news" and "articles" RSS feeds, keeps only the
    article header/body containers and stitches multi-page articles into a
    single document before conversion.
    """

    title = u'PCArena'
    # NOTE(review): unusually large value; presumably meant to disable the
    # age cut-off rather than express a real limit - confirm.
    oldest_article = 18300
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pcarena.pl/public/design/frontend/images/logo.gif'
    cover_url = 'http://pcarena.pl/public/design/frontend/images/logo.gif'
    no_stylesheets = True
    # Keep only the article header and article body containers.
    keep_only_tags = [dict(attrs={'class': ['artHeader', 'art']})]
    remove_tags = [dict(attrs={'class': 'pages'})]
    feeds = [
        (u'Newsy', u'http://pcarena.pl/misc/rss/news'),
        (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles'),
    ]

    def append_page(self, soup, appendtag):
        """Append the bodies of an article's follow-up pages to *appendtag*.

        Looks for the ``div.pagNum`` pager in *soup*. If present, the pager
        is removed from the document and every linked page except the first
        link is downloaded; the ``artBody`` element of each downloaded page
        is inserted at the end of *appendtag*.

        Pages that do not contain an ``artBody`` element are skipped instead
        of aborting the whole article.
        """
        pager = soup.find(name='div', attrs={'class': 'pagNum'})
        if not pager:
            # Single-page article: nothing to stitch together.
            return

        # Collect the links before detaching the pager from the tree.
        page_links = pager.findAll('a')
        pager.extract()

        # The first link is skipped - presumably it points at the page that
        # is already loaded (TODO confirm against the site's markup).
        for link in page_links[1:]:
            page_url = 'http://pcarena.pl' + link['href']
            page_soup = self.index_to_soup(page_url)
            page_body = page_soup.find(attrs={'class': 'artBody'})
            if page_body is None:
                # find() returns None when the page lacks an artBody element;
                # inserting None would raise, so skip this page.
                continue
            appendtag.insert(len(appendtag.contents), page_body)

    def preprocess_html(self, soup):
        """Merge multi-page articles into *soup* and return it."""
        self.append_page(soup, soup.body)
        return soup