from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.ebooks.BeautifulSoup import Comment


class BenchmarkPl(BasicNewsRecipe):
    """Recipe that downloads articles from the benchmark.pl RSS feeds.

    Multi-page articles are stitched together in ``append_page`` by following
    the element with class ``next``; ``preprocess_html`` then makes links
    absolute and strips leftover comment/body containers.
    """

    title = u'Benchmark.pl'
    __author__ = 'fenuks'
    description = u'benchmark.pl, recenzje i testy sprzętu, aktualności, rankingi, sterowniki, porady, opinie'  # noqa
    masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
    cover_url = 'http://www.benchmark.pl/i/logo-dark.png'
    category = 'IT'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    extra_css = 'ul {list-style-type: none;}'
    no_stylesheets = True
    use_embedded_content = False

    # (pattern, replacement) pairs; every match is replaced with ''.
    preprocess_regexps = [
        (
            # NOTE(review): the pattern starts with two newlines and a space,
            # exactly as it appears in the source -- confirm against the
            # site's current markup.
            re.compile(
                u'\n\n Zobacz poprzednie Opinie dnia:.*',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda match: '',
        ),
        (
            # NOTE(review): a trailing lazy `.*?` matches as little as
            # possible, so this effectively strips only the literal prefix
            # -- confirm that is the intent.
            re.compile(u'Więcej o .*?', re.DOTALL | re.IGNORECASE),
            lambda match: '',
        ),
    ]

    keep_only_tags = [
        dict(id=['articleHeader', 'articleGallery']),
        dict(name='div', attrs={'class': ['m_zwykly', 'gallery']}),
        dict(id='article'),
    ]
    remove_tags_after = dict(id='article')
    remove_tags = [
        dict(name='div', attrs={'class': [
            'comments', 'body', 'kategoria', 'socialize', 'thumb',
            'panelOcenaObserwowane', 'categoryNextToSocializeGallery',
            'breadcrumb', 'footer', 'moreTopics',
        ]}),
        dict(name='table', attrs={
            'background': 'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png',  # noqa
        }),
        dict(name='table', attrs={
            'width': '210',
            'cellspacing': '1',
            'cellpadding': '4',
            'border': '0',
            'align': 'right',
        }),
    ]

    # Prefix used by preprocess_html to turn non-http hrefs into absolute URLs.
    INDEX = 'http://www.benchmark.pl'
    feeds = [
        (u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
        (u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml'),  # noqa
    ]

    def append_page(self, soup, appendtag):
        """Append the content of follow-up pages to ``appendtag``.

        Starting from the element with class ``next`` in ``soup``, each linked
        page is fetched with ``self.index_to_soup``; its ``div.body`` (with
        HTML comments removed) is inserted at the end of ``appendtag``.
        Afterwards any ``div.k_ster`` and all ``changePage`` elements are
        removed from ``appendtag``.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives the content of the extra pages.
        """
        visited = set()
        next_link = soup.find(attrs={'class': 'next'})
        while next_link:
            href = next_link.get('href')
            # Stop on a "next" element without a target, and on a link that
            # was already followed (otherwise the loop would never end).
            if not href or href in visited:
                break
            visited.add(href)

            page_soup = self.index_to_soup(href)
            next_link = page_soup.find(attrs={'class': 'next'})
            page_body = page_soup.find(name='div', attrs={'class': 'body'})

            pager = appendtag.find('div', attrs={'class': 'k_ster'})
            if pager:
                pager.extract()

            # A page without div.body has nothing to append; keep following
            # the chain instead of failing on None.
            if page_body is None:
                continue

            for comment in page_body.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), page_body)

        pager = appendtag.find('div', attrs={'class': 'k_ster'})
        if pager:
            pager.extract()
        for pager_link in appendtag.findAll(attrs={'class': 'changePage'}):
            pager_link.extract()

    def preprocess_html(self, soup):
        """Merge follow-up pages and tidy the article markup.

        Calls ``append_page`` on ``soup.body``, prefixes ``INDEX`` to every
        ``href`` that does not start with ``http``, removes elements with
        class ``comments`` or ``body``, and applies inline float/clear styles
        around the ``inlineGallery`` element when one is present.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        self.append_page(soup, soup.body)

        for anchor in soup.findAll('a', href=True):
            if not anchor['href'].startswith('http'):
                anchor['href'] = self.INDEX + anchor['href']

        for leftover in soup.findAll(attrs={'class': ['comments', 'body']}):
            leftover.extract()

        gallery = soup.find(attrs={'class': 'inlineGallery'})
        if gallery:
            for item in gallery.findAll('li'):
                item['style'] = 'float: left; margin-right: 10px;'
            # The gallery may be the last element; only clear the float when
            # a following paragraph actually exists.
            following_paragraph = gallery.findNext('p')
            if following_paragraph is not None:
                following_paragraph['style'] = 'clear: both;'
        return soup