calibre/recipes/pc_lab.recipe
2015-01-23 19:08:21 +05:30

78 lines
3.0 KiB
Python

#!/usr/bin/env python2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
class PCLab(BasicNewsRecipe):
    """Calibre news recipe for the PC Lab website (pclab.pl).

    Articles come from three RSS feeds. Articles that span several pages are
    merged in ``preprocess_html``: ``append_page`` follows the 'next' link of
    each page and appends that page's content to the first page.
    """

    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'
    oldest_article = 30
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    use_embedded_content = False

    # Keep only the div that holds the article itself.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['substance']})
    ]

    # Drop these divs from the kept content.
    remove_tags = [
        dict(name='div', attrs={'class': ['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']})
    ]

    # Links to RSS feeds.
    feeds = [
        (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'),
        (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'),
        (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml')
    ]

    def append_page(self, soup, appendtag):
        """Load the second and subsequent pages and append their content.

        soup      -- (in) parsed first page, possibly containing a 'next'
                     button inside a ``navigation`` div
        appendtag -- (out) tag to which the content of each following page
                     is appended, in page order

        The first ``navigation`` div is removed from *soup* before returning.
        Pagination stops quietly when a page offers no usable 'next' link or
        when a link points at a page that has already been fetched; a page
        without a ``substance``/``data`` div is skipped instead of raising.
        """
        # Find the 'Next' button.
        pager = None
        navigation = soup.find('div', attrs={'class': 'navigation'})
        if navigation is not None:
            first_link = navigation.find('a')
            first_href = ''
            if first_link is not None:
                first_href = first_link.get('href', '')
            # Pagination is not followed when the first navigation link
            # points at a 'news' URL (presumably single-page news items --
            # TODO confirm against the live site).
            if 'news' not in first_href:
                pager = navigation.find('div', attrs={'class': 'next'})

        # Relative URLs already fetched; guards against a 'next' link that
        # loops back to an earlier page.
        seen = set()
        while pager is not None:
            # Search for the 'a' element with the link to the next page
            # (exit if not found).
            link = pager.find('a')
            nexturl = link.get('href') if link is not None else None
            if not nexturl or nexturl in seen:
                break
            seen.add(nexturl)

            soup2 = self.index_to_soup('http://pclab.pl' + nexturl)
            pager = soup2.find('div', attrs={'class': 'next'})

            pagetext = soup2.find('div', attrs={'class': 'substance'})
            if pagetext is not None:
                pagetext = pagetext.find('div', attrs={'class': 'data'})
            if pagetext is None:
                # Nothing to append from this page; keep following the pager.
                continue

            # Strip HTML comments from the fetched content before merging.
            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()

            # Append at the end so pages stay in reading order.
            appendtag.insert(len(appendtag.contents), pagetext)

        # The navigation bar is of no use in the merged article.
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager is not None:
            pager.extract()

    def preprocess_html(self, soup):
        """Merge following pages into *soup* and absolutize site links.

        Returns the same *soup* object after modification.
        """
        # Original author's note: soup.body contains no title and no
        # navigator, they are in soup.
        self.append_page(soup, soup.body)

        # Turn site-relative links ('/...') into absolute pclab.pl URLs.
        for link in soup.findAll('a'):
            href = link.get('href', None)
            if href and href.startswith('/'):
                link['href'] = 'http://pclab.pl' + href
        return soup