mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
89 lines
4.0 KiB
Plaintext
89 lines
4.0 KiB
Plaintext
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import Comment
|
|
|
|
|
|
class Dzieje(BasicNewsRecipe):
    """Calibre news recipe for dzieje.pl.

    The feeds are built by scraping the section listing pages
    (``parse_index`` / ``find_articles``); each downloaded article has its
    links made absolute and, when it is split over several pages, the
    follow-up pages appended (``preprocess_html`` / ``append_page``).
    """

    title = u'dzieje.pl'
    __author__ = 'fenuks'
    description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'  # noqa
    cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
    category = 'history'
    language = 'pl'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = '.imagecache-default {float:left; margin-right:20px;}'
    # Site root; relative links found on the site are resolved against it.
    index = 'http://dzieje.pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'title'}),
        dict(id='content-area'),
    ]
    remove_tags = [
        dict(attrs={'class': 'field field-type-computed field-field-tagi'}),
        dict(id='dogory'),
        dict(name='blockquote'),
    ]

    # (feed title, listing page URL) pairs, in the order the feeds appear.
    # NOTE(review): 'rozmaitości' is the only non-ASCII slug here; the other
    # URLs use ASCII transliterations -- confirm this one resolves.
    _sections = (
        (u"Wiadomości", 'http://dzieje.pl/wiadomosci'),
        (u"Kultura i sztuka", 'http://dzieje.pl/kulturaisztuka'),
        (u"Film", 'http://dzieje.pl/kino'),
        (u"Rozmaitości historyczne", 'http://dzieje.pl/rozmaitości'),
        (u"Książka", 'http://dzieje.pl/ksiazka'),
        (u"Wystawa", 'http://dzieje.pl/wystawa'),
        (u"Edukacja", 'http://dzieje.pl/edukacja'),
        (u"Dzieje się", 'http://dzieje.pl/wydarzenia'),
    )

    # Elements stripped from every follow-up page before it is appended.
    _followup_page_junk = [
        'fieldgroup group-groupkul',
        'fieldgroup group-zdjeciekult',
        'fieldgroup group-zdjecieciekaw',
        'fieldgroup group-zdjecieksiazka',
        'fieldgroup group-zdjeciedu',
        'field field-type-filefield field-field-zdjecieglownawyd',
    ]

    def _absolute_url(self, href):
        """Return *href* resolved against the site root (``self.index``).

        Absolute URLs and non-HTTP schemes (``mailto:`` etc.) are returned
        as they are; relative ones are joined to the site root.
        """
        # Imported locally so the helper does not depend on the module's
        # import block.
        from urllib.parse import urljoin
        return urljoin(self.index + '/', href)

    def append_page(self, soup, appendtag):
        """Append the remaining pages of a paginated article to *appendtag*.

        Follows the ``li.pager-next`` link chain starting in *appendtag*,
        downloads each page, strips the image/gallery field groups from its
        content and appends that content to *appendtag*. Afterwards the pager
        itself, the tag field and all HTML comments are removed from
        *appendtag*. Does nothing when the article has no "next page" link.

        :param soup: parsed article page (unused, kept for the interface)
        :param appendtag: tag that receives the additional pages
        """
        next_item = appendtag.find('li', attrs={'class': 'pager-next'})
        if next_item is None:
            return

        visited = set()
        while next_item is not None and next_item.a is not None:
            url = self._absolute_url(next_item.a['href'])
            if url in visited:
                # The pager points back at a page already fetched; stop
                # instead of looping forever.
                break
            visited.add(url)

            soup2 = self.index_to_soup(url)
            # Look up the following page *before* moving any markup out of
            # soup2, in case the pager lives inside the content element.
            next_item = soup2.find('li', attrs={'class': 'pager-next'})

            content_area = soup2.find(id='content-area')
            pagetext = None
            if content_area is not None:
                pagetext = content_area.find(attrs={'class': 'content'})
            if pagetext is None:
                continue

            for junk in pagetext.findAll(attrs={'class': self._followup_page_junk}):
                junk.extract()
            appendtag.append(pagetext)

        # Drop the pagers and tag lists (including the ones that came with
        # the appended pages) and any HTML comments.
        for junk in appendtag.findAll(attrs={'class': ['item-list', 'field field-type-computed field-field-tagi', ]}):
            junk.extract()
        for comment in appendtag.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()

    def find_articles(self, url):
        """Scrape one section listing page and return its articles.

        :param url: URL of the section listing page
        :return: list of dicts with ``title``, ``url``, ``date`` and
                 ``description`` keys (``date`` and ``description`` are
                 always empty strings)
        """
        articles = []
        soup = self.index_to_soup(url)

        # The article rows are direct <div> children of
        # #content-area > div > div.
        listing = soup.find(id='content-area')
        for _ in range(2):
            if listing is None:
                break
            listing = listing.div
        if listing is None:
            self.log.warn('dzieje.pl: no article listing found on', url)
            return articles

        for row in listing.findAll('div', recursive=False):
            title_field = row.find(attrs={'class': 'views-field-title'})
            span = title_field.span if title_field is not None else None
            link = span.a if span is not None else None
            if link is None or not link.get('href'):
                # Not an article row (or unexpected markup): skip it rather
                # than abort the whole feed.
                continue
            articles.append({
                # .string is None when the anchor holds nested markup.
                'title': link.string or self.tag_to_string(link),
                'url': self._absolute_url(link['href']),
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Return the feeds as a list of ``(title, articles)`` tuples."""
        return [
            (section_title, self.find_articles(section_url))
            for section_title, section_url in self._sections
        ]

    def preprocess_html(self, soup):
        """Make all links absolute and pull in the article's other pages.

        :param soup: parsed article page
        :return: the same *soup*, modified in place
        """
        for a in soup('a', href=True):
            a['href'] = self._absolute_url(a['href'])
        self.append_page(soup, soup.body)
        return soup
|