#!/usr/bin/env python2
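# -*- coding: utf-8 -*-
# Explicit source-encoding declaration so the Polish feed titles below parse
# under Python 2 (assumes this file is saved as UTF-8).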

import re

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
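

# Recipe for pclab.pl: fetches the news, articles and guides feeds and, for
# multi-page articles, stitches each article's pages into a single document
# (see append_page below).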
class PCLab(BasicNewsRecipe):
    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'

    oldest_article = 30
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class': ['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': [
            'toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']})
    ]

    # links to the RSS feeds
    feeds = [
        (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'),
        (u'Artykuły', 'http://pclab.pl/xml/artykuly.xml'),
        (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml')
    ]

    # Load the second and subsequent pages of a multi-page article.
    # in:  soup      - the full first page, including the 'next' button
    # out: appendtag - tag to which the content of the following pages is appended
    def append_page(self, soup, appendtag):
        # find the 'next' button; news items ('news' in the link) are not paginated
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            a = pager.find('a')
            if a is None or 'news' in a['href']:
                pager = None
            else:
                pager = pager.find('div', attrs={'class': 'next'})

        while pager:
            # follow the 'a' element linking to the next page (stop if not found)
            a = pager.find('a')
            if a is None:
                break
            nexturl = a['href']
            soup2 = self.index_to_soup('http://pclab.pl' + nexturl)
            pager = soup2.find('div', attrs={'class': 'next'})
            pagetext = soup2.find('div', attrs={'class': 'substance'})
            pagetext = pagetext.find('div', attrs={'class': 'data'})
            # drop HTML comments from the fetched fragment
            comments = pagetext.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()

            # append the fetched content to the end of the first page
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)

        # remove the paginator itself from the assembled article
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            pager.extract()

    def preprocess_html(self, soup):
        # soup.body contains neither the title nor the paginator; they live in soup itself
        self.append_page(soup, soup.body)
        # turn site-relative links into absolute ones
        for link in soup.findAll('a'):
            href = link.get('href', None)
            if href and href.startswith('/'):
                link['href'] = 'http://pclab.pl' + href
        # drop outbound links to the skapiec.pl price-comparison service
        for r in soup.findAll(name='a', href=re.compile(r'^https://www\.skapiec\.pl/')):
            r.extract()

        return soup