#!/usr/bin/env python2
import re

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class PCLab(BasicNewsRecipe):
    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'

    oldest_article = 30
    max_articles_per_feed = 100
    recursions = 0

    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class': ['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': [
            'toc first', 'toc', 'tags', 'recommendedarticles',
            'name', 'zumi', 'chapters']})
    ]

    # links to RSS feeds
    feeds = [
        (u'Aktualności', u'http://pclab.pl/xml/aktualnosci.xml'),
        (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'),
        (u'Poradniki', u'http://pclab.pl/xml/poradniki.xml')
    ]

    # Load the second and subsequent pages of a multi-page article.
    # in:  soup      - the full first page, including the 'next' button
    # out: appendtag - the tag to which each new page's content is appended
    def append_page(self, soup, appendtag):
        # find the 'Next' button
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            a = pager.find('a')
            if a is None or 'news' in a['href']:
                # single-page news items have no further pages to fetch
                pager = None
            else:
                pager = pager.find('div', attrs={'class': 'next'})

        while pager:
            # find the 'a' element linking to the next page (exit if not found)
            a = pager.find('a')
            if a is None:
                break
            nexturl = a['href']
            soup2 = self.index_to_soup('http://pclab.pl' + nexturl)
            pager = soup2.find('div', attrs={'class': 'next'})
            pagetext = soup2.find('div', attrs={'class': 'substance'})
            pagetext = pagetext.find('div', attrs={'class': 'data'})
            # strip HTML comments before appending the page content
            comments = pagetext.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)

        # the pagination block itself is no longer needed
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            pager.extract()

    def preprocess_html(self, soup):
        # soup.body contains neither the title nor the navigator; both live in soup
        self.append_page(soup, soup.body)
        # make site-relative links absolute
        for link in soup.findAll('a'):
            href = link.get('href', None)
            if href and href.startswith('/'):
                link['href'] = 'http://pclab.pl' + href
        # drop price-comparison links to skapiec.pl
        for r in soup.findAll(name='a',
                              href=re.compile(r'^https://www\.skapiec\.pl/')):
            r.extract()
        return soup
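
# A minimal usage sketch (an assumption about the local setup, not part of the
# recipe itself): calibre recipes like this one can be test-built from the
# command line with calibre's bundled ebook-convert tool. The file names below
# are only examples.
#
#     ebook-convert pclab.recipe pclab.epub --test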