from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Dzieje(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping the section pages of dzieje.pl.

    The article lists come from the section listing pages (see
    ``parse_index``), and multi-page articles are stitched together by
    following the "next page" pager link (see ``append_page``).
    """

    title = u'dzieje.pl'
    __author__ = 'fenuks'
    description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'  # noqa
    cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
    category = 'history'
    language = 'pl'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = '.imagecache-default {float:left; margin-right:20px;}'
    # Site root; prepended to every href that does not start with 'http'.
    index = 'http://dzieje.pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'title'}),
        dict(id='content-area'),
    ]
    remove_tags = [
        dict(attrs={'class': 'field field-type-computed field-field-tagi'}),
        dict(id='dogory'),
        dict(name='blockquote'),
    ]

    def _absolute_url(self, href):
        """Return ``href`` as is if it starts with 'http', else prefix ``self.index``."""
        if href.startswith('http'):
            return href
        return self.index + href

    def append_page(self, soup, appendtag):
        """Append the content of all follow-up pages of an article to ``appendtag``.

        Starting from the ``li.pager-next`` element found in ``appendtag``,
        each linked page is downloaded, stripped of photo/field-group clutter
        and HTML comments, and its ``.content`` element is inserted at the end
        of ``appendtag``.  Afterwards the pager lists, the tag field and any
        remaining comments are removed from ``appendtag``.

        Nothing is changed when ``appendtag`` contains no "next page" link.

        :param soup: soup of the first page (not used directly; ``appendtag``
            is expected to belong to it).
        :param appendtag: element that receives the extra pages.
        """
        tag = appendtag.find('li', attrs={'class': 'pager-next'})
        if not tag:
            return

        clutter = [
            'fieldgroup group-groupkul',
            'fieldgroup group-zdjeciekult',
            'fieldgroup group-zdjecieciekaw',
            'fieldgroup group-zdjecieksiazka',
            'fieldgroup group-zdjeciedu',
            'field field-type-filefield field-field-zdjecieglownawyd',
        ]
        # Guards against a pager that links back to an already fetched page.
        seen = set()
        while tag:
            link = tag.a
            if link is None or not link.get('href'):
                break
            url = self._absolute_url(link['href'])
            if url in seen:
                break
            seen.add(url)

            soup2 = self.index_to_soup(url)
            # Find the next pager link *before* moving content out of soup2:
            # inserting ``pagetext`` elsewhere detaches it from soup2, so a
            # pager located inside it could no longer be found afterwards.
            tag = soup2.find('li', attrs={'class': 'pager-next'})

            content_area = soup2.find(id='content-area')
            pagetext = None
            if content_area is not None:
                pagetext = content_area.find(attrs={'class': 'content'})
            if pagetext is None:
                # Unexpected page layout: keep what was collected so far.
                break

            for r in pagetext.findAll(attrs={'class': clutter}):
                r.extract()
            for comment in pagetext.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        leftovers = ['item-list', 'field field-type-computed field-field-tagi']
        for r in appendtag.findAll(attrs={'class': leftovers}):
            r.extract()
        for comment in appendtag.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()

    def find_articles(self, url):
        """Collect the articles listed on one section page.

        :param url: address of the section listing page.
        :return: list of dicts with the keys ``title``, ``url``, ``date`` and
            ``description``; ``date`` and ``description`` are always empty
            strings.
        """
        articles = []
        soup = self.index_to_soup(url)
        tag = soup.find(id='content-area').div.div
        for i in tag.findAll('div', recursive=False):
            title_field = i.find(attrs={'class': 'views-field-title'})
            temp = None
            if title_field is not None and title_field.span is not None:
                temp = title_field.span.a
            if temp is None:
                # Row without a title link (not an article entry): skip it
                # instead of failing the whole feed.
                continue
            articles.append({
                'title': temp.string,
                'url': self._absolute_url(temp['href']),
                # i.find(attrs={'class':'views-field-created'}).span.string
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Return the feeds as a list of ``(section title, articles)`` tuples."""
        sections = [
            (u"Wiadomości", 'http://dzieje.pl/wiadomosci'),
            (u"Kultura i sztuka", 'http://dzieje.pl/kulturaisztuka'),
            (u"Film", 'http://dzieje.pl/kino'),
            (u"Rozmaitości historyczne", 'http://dzieje.pl/rozmaitości'),
            (u"Książka", 'http://dzieje.pl/ksiazka'),
            (u"Wystawa", 'http://dzieje.pl/wystawa'),
            (u"Edukacja", 'http://dzieje.pl/edukacja'),
            (u"Dzieje się", 'http://dzieje.pl/wydarzenia'),
        ]
        return [(name, self.find_articles(url)) for name, url in sections]

    def preprocess_html(self, soup):
        """Make every link absolute and pull in the article's follow-up pages.

        :param soup: soup of the downloaded article page.
        :return: the same soup object, modified in place.
        """
        for a in soup('a', href=True):
            a['href'] = self._absolute_url(a['href'])
        self.append_page(soup, soup.body)
        return soup