calibre/recipes/ceska_pozice.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe


class ceskaPoziceRecipe(BasicNewsRecipe):
    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20

    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
    ]

    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False

    # Strip ads, comment sections and pagination chrome from article pages.
    remove_tags = [dict(name='div', attrs={'class': ['block-ad', 'region region-content-ad']}),
                   dict(name='ul', attrs={'class': 'links'}),
                   dict(name='div', attrs={'id': ['comments', 'back-to-top']}),
                   dict(name='div', attrs={'class': ['next-page', 'region region-content-ad']}),
                   dict(name='cite')]
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

    # URLs already accepted; the 'Všechny články' feed overlaps the category feeds.
    visited_urls = {}

    def get_article_url(self, article):
        # Skip any URL that has already been returned by an earlier feed.
        url = BasicNewsRecipe.get_article_url(self, article)
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        else:
            self.visited_urls[url] = True
            self.log.debug('Accepting: ' + url)
            return url

    def preprocess_html(self, soup):
        # Articles can be split across several pages; pull the rest into this one.
        self.append_page(soup, soup.body, 3)
        return soup

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class': 'paging-bottom'})
        if pager:
            nextbutton = pager.find('li', attrs={'class': 'pager-next'})
            if nextbutton:
                nexturl = self.domain + nextbutton.a['href']
                soup2 = self.index_to_soup(nexturl)
                texttag = soup2.find('div', attrs={'class': 'main-body'})
                # Drop ad regions and <cite> blocks from the fetched page.
                for it in texttag.findAll('div', attrs={'class': 'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                # Recurse to collect any further pages, then graft the body in
                # and remove the pager so it does not appear in the output.
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)
                pager.extract()