calibre/recipes/ceska_pozice.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals

from calibre.web.feeds.recipes import BasicNewsRecipe


class ceskaPoziceRecipe(BasicNewsRecipe):
    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20

    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
    ]

    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False

    remove_tags = [
        dict(name='div', attrs={'class': ['block-ad', 'region region-content-ad']}),
        dict(name='ul', attrs={'class': 'links'}),
        dict(name='div', attrs={'id': ['comments', 'back-to-top']}),
        dict(name='div', attrs={'class': ['next-page', 'region region-content-ad']}),
        dict(name='cite')
    ]
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

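    # Several feeds can return the same article, so remember which URLs have
    # already been accepted and drop duplicates.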
    visited_urls = {}

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        else:
            self.visited_urls[url] = True
            self.log.debug('Accepting: ' + url)
            return url

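    # Multi-page articles: pull the follow-up pages into the article body.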
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        return soup

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class': 'paging-bottom'})
        if pager:
            nextbutton = pager.find('li', attrs={'class': 'pager-next'})
            if nextbutton:
                nexturl = self.domain + nextbutton.a['href']
                soup2 = self.index_to_soup(nexturl)
                texttag = soup2.find('div', attrs={'class': 'main-body'})
                # Strip ads and citations from the continuation page
                for it in texttag.findAll('div', attrs={'class': 'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                # Recurse in case the article spans more than two pages,
                # then graft the continuation into the first page's body
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)
            pager.extract()
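
# To preview this recipe locally, something like the following calibre command
# can be used; --test fetches only a few articles per feed and -vv prints the
# debug log (including the 'Accepting'/'Ignoring duplicate' messages above).
# The output file name is arbitrary.
#
#   ebook-convert ceska_pozice.recipe ceska_pozice.epub --test -vv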