from __future__ import print_function

import time

from calibre.ebooks.BeautifulSoup import Comment
from calibre.web.feeds.recipes import BasicNewsRecipe


class GryOnlinePl(BasicNewsRecipe):
    title = u'Gry-Online.pl'
    __author__ = 'fenuks'
    description = u'Wiadomości o grach, recenzje, zapowiedzi. Encyklopedia Gier zawiera opisy gier na PC, konsole Xbox360, PS3 i inne platformy.'
    category = 'games'
    language = 'pl'
    oldest_article = 13
    INDEX = 'http://www.gry-online.pl/'
    masthead_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
    cover_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
    max_articles_per_feed = 100
    no_stylesheets = True
    keep_only_tags = [dict(name='div', attrs={'class': [
        'gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})]
    remove_tags = [
        dict({'class': ['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2', 'social-for-old-news', 'social-for-old-rec']})]  # noqa
    feeds = [
        (u'Newsy', 'http://www.gry-online.pl/rss/news.xml'),
        (u'Teksty', 'http://www.gry-online.pl/rss/teksty.xml')]

    def append_page(self, soup, appendtag):
        # Multi-page articles use one of two pager layouts: 'n5p' on newer
        # pages, 'S018stronyr' on older ones. Fetch the remaining pages and
        # append their article bodies to the first page.
        tag = appendtag.find('div', attrs={'class': 'n5p'})
        if tag:
            nexturls = tag.findAll('a')
            # Strip the 'http://www.gry-online.pl/' prefix (25 characters)
            # and any query string from the canonical URL.
            url_part = soup.find('link', attrs={'rel': 'canonical'})['href']
            url_part = url_part[25:].rpartition('?')[0]
            for nexturl in nexturls[1:-1]:
                finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
                soup2 = None
                for _ in range(10):  # retry transient fetch failures
                    try:
                        soup2 = self.index_to_soup(finalurl)
                        break
                    except Exception:
                        print('retrying in 0.5s')
                        time.sleep(0.5)
                if soup2 is None:  # every attempt failed, skip this page
                    continue
                pagetext = soup2.find(attrs={'class': 'gc660'})
                for r in pagetext.findAll(name='header'):
                    r.extract()
                for r in pagetext.findAll(attrs={'itemprop': 'description'}):
                    r.extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            # Drop the pager and sharing widgets from the merged article.
            for r in appendtag.findAll(attrs={'class': ['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'imh10b']}):
                r.extract()
            comments = appendtag.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
        else:
            tag = appendtag.find('div', attrs={'class': 'S018stronyr'})
            if tag:
                nexturl = tag.a
                url_part = soup.find(
                    'link', attrs={'rel': 'canonical'})['href']
                url_part = url_part[25:].rpartition('?')[0]
                while tag:
                    # The final page carries a 'right left-dead' marker.
                    end = tag.find(attrs={'class': 'right left-dead'})
                    if end:
                        break
                    else:
                        nexturl = tag.a
                    finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
                    soup2 = None
                    for _ in range(10):  # retry transient fetch failures
                        try:
                            soup2 = self.index_to_soup(finalurl)
                            break
                        except Exception:
                            print('retrying in 0.5s')
                            time.sleep(0.5)
                    if soup2 is None:  # every attempt failed, stop paginating
                        break
                    tag = soup2.find('div', attrs={'class': 'S018stronyr'})
                    pagetext = soup2.find(attrs={'class': 'gc660'})
                    for r in pagetext.findAll(name='header'):
                        r.extract()
                    for r in pagetext.findAll(attrs={'itemprop': 'description'}):
                        r.extract()
                    comments = pagetext.findAll(
                        text=lambda text: isinstance(text, Comment))
                    for comment in comments:
                        comment.extract()
                    pos = len(appendtag.contents)
                    appendtag.insert(pos, pagetext)
                # Drop the pager and sharing widgets from the merged article.
                for r in appendtag.findAll(attrs={'class': ['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony', 'imh10b']}):
                    r.extract()
                comments = appendtag.findAll(
                    text=lambda text: isinstance(text, Comment))
                for comment in comments:
                    comment.extract()

    def image_url_processor(self, baseurl, url):
        # Some images are referenced with a leading '..'; strip it so the
        # remainder resolves as an absolute path.
        if url.startswith('..'):
            return url[2:]
        return url

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
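
# Hedged usage note (an assumption, not part of the recipe itself): calibre
# recipes such as this one can be test-driven from the command line with
# ebook-convert, which accepts a .recipe file as input; --test limits the
# download to a couple of articles per feed, which is convenient when
# debugging append_page(). The file name below is hypothetical.
#
#   ebook-convert gry_online.recipe gry_online.epub --test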