from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.ebooks.BeautifulSoup import Comment


class BenchmarkPl(BasicNewsRecipe):
    """Calibre news recipe for benchmark.pl.

    Downloads the two RSS feeds listed in ``feeds``, stitches multi-page
    articles together (``append_page``) and tidies the resulting markup
    (``preprocess_html``).
    """

    title = u'Benchmark.pl'
    __author__ = 'fenuks'
    description = u'benchmark.pl, recenzje i testy sprzętu, aktualności, rankingi, sterowniki, porady, opinie'
    masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
    cover_url = 'http://www.benchmark.pl/i/logo-dark.png'
    category = 'IT'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    extra_css = 'ul {list-style-type: none;}'
    no_stylesheets = True
    use_embedded_content = False
    #remove_attributes = ['style']

    # Plain u'' literals (instead of the Python 2-only ur'' prefix) keep the
    # recipe importable on Python 3; neither pattern contains a backslash
    # sequence that would need raw-string treatment.
    # NOTE(review): neither pattern contains any HTML markup, and the second
    # one ends in a lazy `.*?`, which can only match the empty string at the
    # end of a pattern (so it strips just the text 'Więcej o '). Confirm the
    # intended patterns against the upstream recipe.
    preprocess_regexps = [
        (
            re.compile(
                u'\n\n Zobacz poprzednie Opinie dnia:.*',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda match: '',
        ),
        (
            re.compile(u'Więcej o .*?', re.DOTALL | re.IGNORECASE),
            lambda match: '',
        ),
    ]

    keep_only_tags = [
        dict(id=['articleHeader', 'articleGallery']),
        dict(name='div', attrs={'class': ['m_zwykly', 'gallery']}),
        dict(id='article'),
    ]
    remove_tags_after = dict(id='article')
    remove_tags = [
        dict(name='div', attrs={'class': [
            'comments', 'body', 'kategoria', 'socialize', 'thumb',
            'panelOcenaObserwowane', 'categoryNextToSocializeGallery',
            'breadcrumb', 'footer', 'moreTopics',
        ]}),
        dict(name='table', attrs={
            'background': 'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png',
        }),
        dict(name='table', attrs={
            'width': '210', 'cellspacing': '1', 'cellpadding': '4',
            'border': '0', 'align': 'right',
        }),
    ]

    # Site root, prefixed to links that do not start with 'http'.
    INDEX = 'http://www.benchmark.pl'
    feeds = [
        (u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
        (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml'),
    ]

    def _absolute_url(self, href):
        """Return *href* unchanged if it starts with 'http', else prefix INDEX."""
        if href.startswith('http'):
            return href
        return self.INDEX + href

    def append_page(self, soup, appendtag):
        """Append the bodies of all follow-up pages of an article to *appendtag*.

        Follows the element with class ``next`` from page to page, fetching
        each page with ``index_to_soup`` and inserting its ``div.body`` at the
        end of *appendtag*. Pages without a ``div.body`` are skipped, and the
        walk stops when a ``next`` element carries no ``href``.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives the content of the later pages.
        """
        nexturl = soup.find(attrs={'class': 'next'})
        while nexturl:
            href = nexturl.get('href')
            if not href:
                # A "next" marker without a target cannot be followed.
                break
            # Resolve relative links the same way preprocess_html does.
            soup2 = self.index_to_soup(self._absolute_url(href))
            nexturl = soup2.find(attrs={'class': 'next'})
            pagetext = soup2.find(name='div', attrs={'class': 'body'})

            tag = appendtag.find('div', attrs={'class': 'k_ster'})
            if tag:
                tag.extract()

            if pagetext is None:
                # Nothing to append from this page; keep following the chain.
                continue

            comments = pagetext.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        # Final cleanup once all pages have been merged.
        leftover = appendtag.find('div', attrs={'class': 'k_ster'})
        if leftover:
            leftover.extract()
        for r in appendtag.findAll(attrs={'class': 'changePage'}):
            r.extract()

    def preprocess_html(self, soup):
        """Merge paginated content and normalise links and gallery markup.

        :param soup: parsed article page.
        :return: the same *soup* object, modified in place.
        """
        self.append_page(soup, soup.body)

        for a in soup('a'):
            # .get() replaces the deprecated has_key() and returns None when
            # the anchor has no href attribute.
            href = a.get('href')
            if href is not None and not href.startswith('http'):
                a['href'] = self.INDEX + href

        # NOTE(review): append_page inserts `div` elements with class 'body',
        # and this loop extracts every element with class 'body' — confirm
        # that the merged pages are not being dropped here.
        for r in soup.findAll(attrs={'class': ['comments', 'body']}):
            r.extract()

        tag1 = soup.find(attrs={'class': 'inlineGallery'})
        if tag1:
            for tag in tag1.findAll('li'):
                tag['style'] = 'float: left; margin-right: 10px;'
            following_p = tag1.findNext('p')
            if following_p is not None:
                # Stop the floated gallery items from wrapping the next
                # paragraph.
                following_p['style'] = 'clear: both;'
        return soup