from calibre.web.feeds.news import BasicNewsRecipe
import re


class Benchmark_pl(BasicNewsRecipe):
    """Calibre news recipe for benchmark.pl.

    Downloads the two RSS feeds listed in ``feeds`` and, before conversion,
    stitches multi-page articles and multi-page image galleries into a
    single document (see ``preprocess_html``).
    """

    title = u'Benchmark.pl'
    __author__ = 'fenuks'
    description = u'benchmark.pl -IT site'
    cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
    category = 'IT'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True

    # Cut everything from the first "Wiecej o ..." marker to the end of the
    # page (DOTALL makes ``.*`` span newlines). Written as u'\\b...' instead
    # of ur'\b...' so the file parses on Python 3 as well as Python 2; the
    # resulting pattern text is identical.
    preprocess_regexps = [
        (re.compile(u'\\bWięcej o .*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': ['m_zwykly', 'gallery']}),
    ]
    remove_tags_after = dict(name='div', attrs={'class': 'body'})
    remove_tags = [
        dict(name='div', attrs={'class': [
            'kategoria',
            'socialize',
            'thumb',
            'panelOcenaObserwowane',
            'categoryNextToSocializeGallery',
        ]}),
    ]

    # Site root; the hrefs and image paths read from pages are appended to it.
    INDEX = 'http://www.benchmark.pl'

    feeds = [
        (u'Aktualności',
         u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
        (u'Testy i recenzje',
         u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml'),
    ]

    def _drop_first(self, container, name, css_class):
        """Detach the first ``name`` tag with class ``css_class``, if any.

        Does nothing when no such tag exists, so callers do not have to
        guard against a ``None`` lookup result.
        """
        found = container.find(name, attrs={'class': css_class})
        if found is not None:
            found.extract()

    def _preview_to_img(self, container):
        """Turn the first 'preview' element of ``container`` into an <img>.

        The image path is read from the ``style`` attribute of the first
        <div> nested in the 'preview' div and prefixed with ``INDEX``.
        The element is left untouched when that structure is missing.
        """
        holder = container.find('div', attrs={'class': 'preview'})
        if holder is None or holder.div is None:
            return
        style = holder.div.get('style')
        if not style:
            return
        # Skips a fixed 16-character prefix -- presumably "background:url('"
        # (TODO confirm against live markup).
        path = style[16:]
        end = path.find("')")
        if end != -1:
            # Only cut when the terminator exists; slicing with -1 would
            # silently drop the last character of the path.
            path = path[:end]
        target = container.find(attrs={'class': 'preview'})
        target.name = 'img'
        target['src'] = self.INDEX + path

    def append_page(self, soup, appendtag):
        """Append the body of every follow-up page of an article.

        Follows the link wrapping each ``span.next`` element, fetches the
        linked page and inserts its ``div.body`` at the end of ``appendtag``.
        ``div.k_ster`` blocks (presumably the page-navigation bar -- verify)
        are removed from ``appendtag`` along the way.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives the additional page bodies.
        """
        next_marker = soup.find('span', attrs={'class': 'next'})
        while next_marker is not None:
            href = next_marker.parent.get('href')
            if not href:
                # The marker is not wrapped in a usable link; stop paging
                # instead of failing the whole article.
                break
            soup2 = self.index_to_soup(self.INDEX + href)
            next_marker = soup2.find('span', attrs={'class': 'next'})
            pagetext = soup2.find(name='div', attrs={'class': 'body'})
            # The block may be absent, so the removal is guarded.
            self._drop_first(appendtag, 'div', 'k_ster')
            if pagetext is not None:
                appendtag.insert(len(appendtag.contents), pagetext)
        self._drop_first(appendtag, 'div', 'k_ster')

    def image_article(self, soup, appendtag):
        """Merge all pages of an image gallery into ``appendtag``.

        Converts the 'preview' element of each page into an <img>, follows
        the ``a.move_next`` links, and appends each further page's
        ``div.gallery`` (minus title, thumbnails, rating panel and the
        next/back links) to ``appendtag``.

        :param soup: parsed first page of the gallery.
        :param appendtag: tag that receives the additional gallery pages.
        """
        preview = soup.find('div', attrs={'class': 'preview'})
        if preview is None:
            return

        # Grab the link before it is detached below; its href stays readable.
        next_link = preview.find('a', attrs={'class': 'move_next'})
        self._preview_to_img(appendtag)
        # A single-image gallery has no next link, so removal is guarded.
        self._drop_first(appendtag, 'a', 'move_next')

        while next_link is not None:
            href = next_link.get('href')
            if not href:
                break
            soup2 = self.index_to_soup(self.INDEX + href)
            next_link = soup2.find('a', attrs={'class': 'move_next'})
            self._preview_to_img(soup2)
            pagetext = soup2.find('div', attrs={'class': 'gallery'})
            if pagetext is None:
                # Nothing to append from this page; keep following links.
                continue
            for css_class in ('title', 'thumb', 'panelOcenaObserwowane'):
                self._drop_first(pagetext, 'div', css_class)
            self._drop_first(pagetext, 'a', 'move_next')
            self._drop_first(pagetext, 'a', 'move_back')
            appendtag.insert(len(appendtag.contents), pagetext)

    def preprocess_html(self, soup):
        """Stitch multi-page content into ``soup.body`` and return ``soup``.

        Pages containing a ``div.preview`` are handled as image galleries;
        everything else is handled as a (possibly multi-page) article.
        """
        if soup.find('div', attrs={'class': 'preview'}) is not None:
            self.image_article(soup, soup.body)
        else:
            self.append_page(soup, soup.body)
        return soup