__license__ = 'GPL v3'
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Odkrywcy(BasicNewsRecipe):
    """Calibre news recipe for odkrywcy.pl.

    The article index is scraped from the site's category pages (see
    ``parse_index``), and articles split over several pages are merged
    into a single document (see ``append_page``).
    """

    title = u'Odkrywcy.pl'
    __author__ = 'fenuks'
    description = u''
    language = 'pl'
    extra_css = 'img {display: block;}'
    cover_url = ''
    INDEX = 'http://odkrywcy.pl'
    use_embedded_content = False
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}
    keep_only_tags = [dict(attrs={'class': 'content'})]
    remove_tags = [
        dict(name='a', attrs={'href': ['#opOpinie', '#opinie']}),
        dict(attrs={'class': ['fr', 'clra', 'close',
                              'wpsocial-fbFanpageBox', 'tagi', 'test']}),
        dict(id=['rekSrd05', 'moreTopNews']),
        dict(name='img', attrs={'class': 'zr'}),
        dict(name='img', attrs={'alt': u'Następne'}),
    ]
    remove_tags_after = dict(id='aTxt')
    # Placeholder only: the feed list actually returned by this recipe is
    # built in parse_index() below.
    feeds = [(u'', '')]

    # Publication stamp shown in each teaser, e.g. "dodano: 2013-05-17".
    # Raw string so that "\d" is a regex class, not a string escape.
    _DATE_RE = re.compile(r'dodano: (\d{4}-\d{2}-\d{2})')

    # (feed title, category page URL) pairs, in the order feeds are emitted.
    _CATEGORIES = (
        (u'Człowiek',
         'http://odkrywcy.pl/kat,111396,name,Czlowiek,kategoria.html'),
        (u'Technologie',
         'http://odkrywcy.pl/kat,111398,name,Technologie,kategoria.html'),
        (u'Ekologia',
         'http://odkrywcy.pl/kat,111400,name,Ekologia,kategoria.html'),
        (u'Kosmos',
         'http://odkrywcy.pl/kat,111402,name,Kosmos,kategoria.html'),
        (u'Cywilizacja',
         'http://odkrywcy.pl/kat,111404,name,Cywilizacja,kategoria.html'),
        (u'Przyroda',
         'http://odkrywcy.pl/kat,111406,name,Przyroda,kategoria.html'),
        (u'Fizyka i chemia',
         'http://odkrywcy.pl/kat,111408,name,Fizyka,kategoria.html'),
        (u'Historia',
         'http://odkrywcy.pl/kat,122994,name,Historia,kategoria.html'),
        (u'Media',
         'http://odkrywcy.pl/kat,116794,name,Media,media.html'),
    )

    # Attribute filters for elements stripped from the merged article once
    # all continuation pages have been appended (applied in this order).
    _LEFTOVER_FILTERS = (
        {'class': 'galStr'},
        {'alt': 'Następne'},
        {'alt': 'Poprzednie'},
        {'class': 'clra'},
        {'class': 'close'},
        {'class': 'tagi'},
        {'id': 'moreTopNews'},
    )

    def find_articles(self, url):
        """Collect recent articles from one category page.

        Every element with class ``katZj clra`` is treated as a teaser.
        Its ``<small>`` text supplies the publication date and its first
        ``<a>`` supplies the title and the site-relative link.

        Teasers older than ``oldest_article`` days are dropped. Teasers
        without a parseable date or without a link are skipped rather than
        aborting the whole download.

        :param url: absolute URL of the category page.
        :return: list of dicts with ``title``, ``url``, ``date`` and
            ``description`` keys (the last two are always empty strings).
        """
        articles = []
        soup = self.index_to_soup(url)
        today = datetime.datetime.now().date()
        for teaser in soup.findAll(attrs={'class': 'katZj clra'}):
            stamp = teaser.find('small')
            # .string is None when the tag has no single text child.
            stamp_text = stamp.string if stamp is not None else None
            match = self._DATE_RE.search(stamp_text) if stamp_text else None
            if match is None:
                continue
            try:
                published = datetime.datetime.strptime(
                    match.group(1), '%Y-%m-%d').date()
            except ValueError:
                # Digits matched the pattern but are not a real date.
                continue
            if (today - published).days > self.oldest_article:
                continue

            link = teaser.find('a')
            if link is None or not link.get('href'):
                continue
            articles.append({
                'title': link.string,
                'url': self.INDEX + link['href'],
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Build the feed list, one feed per site category.

        :return: list of ``(feed title, article list)`` tuples in the
            order given by ``_CATEGORIES``.
        """
        return [(name, self.find_articles(page_url))
                for name, page_url in self._CATEGORIES]

    def append_page(self, soup, appendtag):
        """Merge the continuation pages of an article into ``appendtag``.

        Follows the ``a.btnNext`` link chain starting from ``soup``. On
        each fetched page, the first ``<h1>`` and all HTML comments are
        stripped from every ``content`` container, which is then appended
        to ``appendtag``. When the chain ends, navigation leftovers listed
        in ``_LEFTOVER_FILTERS`` are removed from ``appendtag``.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives the extra pages' content;
            modified in place.
        """
        next_link = soup.find('a', attrs={'class': 'btnNext'})
        visited = set()
        while next_link is not None:
            href = next_link.get('href')
            # Stop on a link without a target, or on a link already
            # followed (guards against pagination loops).
            if not href or href in visited:
                break
            visited.add(href)

            page = self.index_to_soup(self.INDEX + href)
            next_link = page.find(name='a', attrs={'class': 'btnNext'})
            for container in page.findAll(attrs={'class': 'content'}):
                header = container.find(name='h1')
                if header is not None:
                    header.extract()
                for comment in container.findAll(
                        text=lambda text: isinstance(text, Comment)):
                    comment.extract()
                appendtag.insert(len(appendtag.contents), container)

        for attrs in self._LEFTOVER_FILTERS:
            for leftover in appendtag.findAll(attrs=attrs):
                leftover.extract()

    def preprocess_html(self, soup):
        """Append continuation pages to the article body and return ``soup``.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        # Nothing to append to when the document has no <body>.
        if soup.body is not None:
            self.append_page(soup, soup.body)
        return soup