calibre/recipes/dzieje_pl.recipe
2019-03-22 23:47:49 +05:30

89 lines
4.0 KiB
Plaintext

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment
class Dzieje(BasicNewsRecipe):
    """Calibre news recipe for the dzieje.pl history portal.

    The feeds are built by scraping the site's section listing pages
    (see ``parse_index``) rather than from RSS, and paginated articles are
    stitched into a single document (see ``append_page``).
    """

    title = u'dzieje.pl'
    __author__ = 'fenuks'
    description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'  # noqa
    cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
    category = 'history'
    language = 'pl'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = '.imagecache-default {float:left; margin-right:20px;}'
    # Site root; prefixed to every site-relative link found while scraping.
    index = 'http://dzieje.pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'title'}),
        dict(id='content-area'),
    ]
    remove_tags = [
        dict(attrs={'class': 'field field-type-computed field-field-tagi'}),
        dict(id='dogory'),
        dict(name='blockquote'),
    ]

    def _to_absolute_url(self, href):
        """Return ``href`` as an absolute URL.

        A value that already starts with ``'http'`` is returned unchanged;
        anything else is treated as site-relative and prefixed with
        ``self.index``.
        """
        if href.startswith('http'):
            return href
        return self.index + href

    def append_page(self, soup, appendtag):
        """Append the follow-up pages of a paginated article to ``appendtag``.

        Starting from the ``li.pager-next`` element inside ``appendtag``, each
        linked page is loaded with ``self.index_to_soup``, its
        ``#content-area .content`` element is stripped of image/field groups
        and appended to ``appendtag``. When at least one pager link was
        present, pager lists, the tag field and HTML comments are finally
        removed from ``appendtag``.

        :param soup: parsed article; not used directly, kept for the
            existing call signature.
        :param appendtag: element that receives the additional pages.
        :return: ``None``; ``appendtag`` is modified in place.
        """
        tag = appendtag.find('li', attrs={'class': 'pager-next'})
        if tag is None:
            return

        page_clutter = [
            'fieldgroup group-groupkul',
            'fieldgroup group-zdjeciekult',
            'fieldgroup group-zdjecieciekaw',
            'fieldgroup group-zdjecieksiazka',
            'fieldgroup group-zdjeciedu',
            'field field-type-filefield field-field-zdjecieglownawyd',
        ]
        visited = set()
        while tag is not None and tag.a is not None:
            url = self._to_absolute_url(tag.a['href'])
            if url in visited:
                # A pager that points back to an already fetched page would
                # otherwise keep this loop running forever.
                break
            visited.add(url)

            soup2 = self.index_to_soup(url)
            # Look up the next link *before* moving content out of soup2:
            # inserting pagetext into appendtag detaches it from soup2, and
            # the pager may live inside it.
            tag = soup2.find('li', attrs={'class': 'pager-next'})

            content_area = soup2.find(id='content-area')
            if content_area is None:
                continue
            pagetext = content_area.find(attrs={'class': 'content'})
            if pagetext is None:
                continue
            for r in pagetext.findAll(attrs={'class': page_clutter}):
                r.extract()
            # Previously the fetched page was cleaned up and then dropped,
            # so multi-page articles only ever contained their first page.
            appendtag.insert(len(appendtag.contents), pagetext)

        # Runs after all pages are appended so their pagers, tag fields and
        # comments are removed as well.
        leftovers = ['item-list', 'field field-type-computed field-field-tagi']
        for r in appendtag.findAll(attrs={'class': leftovers}):
            r.extract()
        comments = appendtag.findAll(
            text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()

    def find_articles(self, url):
        """Scrape one section listing page into a list of article dicts.

        :param url: address of the listing page.
        :return: list of dicts with the keys ``title``, ``url``, ``date``
            and ``description``; ``date`` and ``description`` are always
            empty strings. Rows without a title link are skipped, and an
            empty list is returned when the listing container is missing.
        """
        articles = []
        soup = self.index_to_soup(url)
        try:
            listing = soup.find(id='content-area').div.div
        except AttributeError:
            # One of the expected wrapper elements is absent.
            listing = None
        if listing is None:
            return articles

        for row in listing.findAll('div', recursive=False):
            try:
                link = row.find(attrs={'class': 'views-field-title'}).span.a
            except AttributeError:
                link = None
            if link is None or not link.get('href'):
                # Not an article row; do not let it abort the whole feed.
                continue
            articles.append({
                'title': link.string,
                'url': self._to_absolute_url(link['href']),
                # The listing's 'views-field-created' field is not read.
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Build the feed list from the site's section listing pages.

        :return: list of ``(section title, articles)`` tuples, one per
            section, in the order given below; ``articles`` is the result
            of ``find_articles`` for that section.
        """
        sections = (
            (u"Wiadomości", 'http://dzieje.pl/wiadomosci'),
            (u"Kultura i sztuka", 'http://dzieje.pl/kulturaisztuka'),
            (u"Film", 'http://dzieje.pl/kino'),
            # NOTE(review): the only section URL with a non-ASCII character
            # in its path; left as found - confirm it resolves.
            (u"Rozmaitości historyczne", 'http://dzieje.pl/rozmaitości'),
            (u"Książka", 'http://dzieje.pl/ksiazka'),
            (u"Wystawa", 'http://dzieje.pl/wystawa'),
            (u"Edukacja", 'http://dzieje.pl/edukacja'),
            (u"Dzieje się", 'http://dzieje.pl/wydarzenia'),
        )
        return [
            (name, self.find_articles(section_url))
            for name, section_url in sections
        ]

    def preprocess_html(self, soup):
        """Merge follow-up pages into the article and absolutise its links.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        # Append first so that links inside the appended pages are made
        # absolute too.
        self.append_page(soup, soup.body)
        for a in soup('a', href=True):
            a['href'] = self._to_absolute_url(a['href'])
        return soup